sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
@@ -13,10 +13,11 @@
13
13
  # ==============================================================================
14
14
  """Pydantic models for OpenAI API protocol"""
15
15
 
16
+ import logging
16
17
  import time
17
18
  import uuid
18
19
  from dataclasses import dataclass
19
- from typing import Any, Dict, List, NamedTuple, Optional, TypeAlias, Union
20
+ from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypeAlias, Union
20
21
 
21
22
  from openai.types.responses import (
22
23
  ResponseFunctionToolCall,
@@ -36,6 +37,11 @@ from pydantic import (
36
37
  model_validator,
37
38
  )
38
39
  from typing_extensions import Literal
40
+ from xgrammar import StructuralTag
41
+
42
+ from sglang.utils import convert_json_schema_to_str
43
+
44
+ logger = logging.getLogger(__name__)
39
45
 
40
46
  DEFAULT_MODEL_NAME = "default"
41
47
 
@@ -123,12 +129,23 @@ class StructuresResponseFormat(BaseModel):
123
129
  end: str
124
130
 
125
131
 
126
- class StructuralTagResponseFormat(BaseModel):
132
+ # NOTE(dark): keep this for backward compatibility
133
+ class LegacyStructuralTagResponseFormat(BaseModel):
127
134
  type: Literal["structural_tag"]
128
135
  structures: List[StructuresResponseFormat]
129
136
  triggers: List[str]
130
137
 
131
138
 
139
+ StructuralTagResponseFormat: TypeAlias = Union[
140
+ LegacyStructuralTagResponseFormat, StructuralTag
141
+ ]
142
+
143
+ ToolCallConstraint: TypeAlias = Union[
144
+ Tuple[Literal["structural_tag"], StructuralTagResponseFormat],
145
+ Tuple[Literal["json_schema"], Any], # json_schema can be dict/str/None
146
+ ]
147
+
148
+
132
149
  class FileRequest(BaseModel):
133
150
  # https://platform.openai.com/docs/api-reference/files/create
134
151
  file: bytes # The File object (not file name) to be uploaded
@@ -187,7 +204,10 @@ class BatchResponse(BaseModel):
187
204
  class CompletionRequest(BaseModel):
188
205
  # Ordered by official OpenAI API documentation
189
206
  # https://platform.openai.com/docs/api-reference/completions/create
190
- model: str = DEFAULT_MODEL_NAME
207
+ model: str = Field(
208
+ default=DEFAULT_MODEL_NAME,
209
+ description="Model name. Supports LoRA adapters via 'base-model:adapter-name' syntax.",
210
+ )
191
211
  prompt: Union[List[int], List[List[int]], str, List[str]]
192
212
  best_of: Optional[int] = None
193
213
  echo: bool = False
@@ -216,12 +236,15 @@ class CompletionRequest(BaseModel):
216
236
  ebnf: Optional[str] = None
217
237
  repetition_penalty: float = 1.0
218
238
  stop_token_ids: Optional[List[int]] = None
239
+ stop_regex: Optional[Union[str, List[str]]] = None
219
240
  no_stop_trim: bool = False
220
241
  ignore_eos: bool = False
221
242
  skip_special_tokens: bool = True
222
243
  lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
223
244
  session_params: Optional[Dict] = None
224
245
  response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
246
+ custom_params: Optional[Dict] = None
247
+ custom_logit_processor: Optional[str] = None
225
248
 
226
249
  # For PD disaggregation
227
250
  bootstrap_host: Optional[Union[List[str], str]] = None
@@ -423,7 +446,10 @@ class ChatCompletionRequest(BaseModel):
423
446
  # Ordered by official OpenAI API documentation
424
447
  # https://platform.openai.com/docs/api-reference/chat/create
425
448
  messages: List[ChatCompletionMessageParam]
426
- model: str = DEFAULT_MODEL_NAME
449
+ model: str = Field(
450
+ default=DEFAULT_MODEL_NAME,
451
+ description="Model name. Supports LoRA adapters via 'base-model:adapter-name' syntax.",
452
+ )
427
453
  frequency_penalty: float = 0.0
428
454
  logit_bias: Optional[Dict[str, float]] = None
429
455
  logprobs: bool = False
@@ -445,8 +471,8 @@ class ChatCompletionRequest(BaseModel):
445
471
  stop: Optional[Union[str, List[str]]] = None
446
472
  stream: bool = False
447
473
  stream_options: Optional[StreamOptions] = None
448
- temperature: float = 0.7
449
- top_p: float = 1.0
474
+ temperature: Optional[float] = None
475
+ top_p: Optional[float] = None
450
476
  user: Optional[str] = None
451
477
  tools: Optional[List[Tool]] = Field(default=None, examples=[None])
452
478
  tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
@@ -461,6 +487,52 @@ class ChatCompletionRequest(BaseModel):
461
487
  "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
462
488
  )
463
489
 
490
+ # Extra parameters for SRT backend only and will be ignored by OpenAI models.
491
+ top_k: Optional[int] = None
492
+ min_p: Optional[float] = None
493
+ min_tokens: int = 0
494
+ regex: Optional[str] = None
495
+ ebnf: Optional[str] = None
496
+ repetition_penalty: Optional[float] = None
497
+ stop_token_ids: Optional[List[int]] = None
498
+ stop_regex: Optional[Union[str, List[str]]] = None
499
+ no_stop_trim: bool = False
500
+ ignore_eos: bool = False
501
+ continue_final_message: bool = False
502
+ skip_special_tokens: bool = True
503
+ lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
504
+ session_params: Optional[Dict] = None
505
+ separate_reasoning: bool = True
506
+ stream_reasoning: bool = True
507
+ chat_template_kwargs: Optional[Dict] = None
508
+
509
+ # Custom logit processor for advanced sampling control
510
+ custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
511
+ custom_params: Optional[Dict] = None
512
+
513
+ # For request id
514
+ rid: Optional[Union[List[str], str]] = None
515
+ # Extra key for classifying the request (e.g. cache_salt)
516
+ extra_key: Optional[Union[List[str], str]] = None
517
+ # Cache salt for request caching
518
+ cache_salt: Optional[Union[List[str], str]] = None
519
+ # Priority for the request
520
+ priority: Optional[int] = None
521
+
522
+ # For PD disaggregation
523
+ bootstrap_host: Optional[Union[List[str], str]] = None
524
+ bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
525
+ bootstrap_room: Optional[Union[List[int], int]] = None
526
+
527
+ # OpenAI/SGLang default sampling parameters
528
+ _DEFAULT_SAMPLING_PARAMS = {
529
+ "temperature": 1.0,
530
+ "top_p": 1.0,
531
+ "top_k": -1,
532
+ "min_p": 0.0,
533
+ "repetition_penalty": 1.0,
534
+ }
535
+
464
536
  @model_validator(mode="before")
465
537
  @classmethod
466
538
  def set_tool_choice_default(cls, values):
@@ -531,37 +603,83 @@ class ChatCompletionRequest(BaseModel):
531
603
 
532
604
  return values
533
605
 
534
- # Extra parameters for SRT backend only and will be ignored by OpenAI models.
535
- top_k: int = -1
536
- min_p: float = 0.0
537
- min_tokens: int = 0
538
- regex: Optional[str] = None
539
- ebnf: Optional[str] = None
540
- repetition_penalty: float = 1.0
541
- stop_token_ids: Optional[List[int]] = None
542
- no_stop_trim: bool = False
543
- ignore_eos: bool = False
544
- continue_final_message: bool = False
545
- skip_special_tokens: bool = True
546
- lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
547
- session_params: Optional[Dict] = None
548
- separate_reasoning: bool = True
549
- stream_reasoning: bool = True
550
- chat_template_kwargs: Optional[Dict] = None
606
+ def to_sampling_params(
607
+ self,
608
+ stop: List[str],
609
+ model_generation_config: Dict[str, Any],
610
+ tool_call_constraint: Optional[ToolCallConstraint] = None,
611
+ ) -> Dict[str, Any]:
612
+ """
613
+ Convert request to sampling parameters.
614
+ Priority: user value > model generation_config > OpenAI defaults
615
+ """
616
+
617
+ def get_param(param_name: str):
618
+ value = getattr(self, param_name)
619
+ if value is None:
620
+ return model_generation_config.get(
621
+ param_name, self._DEFAULT_SAMPLING_PARAMS[param_name]
622
+ )
623
+ return value
624
+
625
+ sampling_params = {
626
+ "temperature": get_param("temperature"),
627
+ "max_new_tokens": self.max_tokens or self.max_completion_tokens,
628
+ "min_new_tokens": self.min_tokens,
629
+ "stop": stop,
630
+ "stop_token_ids": self.stop_token_ids,
631
+ "stop_regex": self.stop_regex,
632
+ "top_p": get_param("top_p"),
633
+ "top_k": get_param("top_k"),
634
+ "min_p": get_param("min_p"),
635
+ "presence_penalty": self.presence_penalty,
636
+ "frequency_penalty": self.frequency_penalty,
637
+ "repetition_penalty": get_param("repetition_penalty"),
638
+ "regex": self.regex,
639
+ "ebnf": self.ebnf,
640
+ "n": self.n,
641
+ "no_stop_trim": self.no_stop_trim,
642
+ "ignore_eos": self.ignore_eos,
643
+ "skip_special_tokens": self.skip_special_tokens,
644
+ "logit_bias": self.logit_bias,
645
+ "custom_params": self.custom_params,
646
+ }
551
647
 
552
- # For request id
553
- rid: Optional[Union[List[str], str]] = None
554
- # Extra key for classifying the request (e.g. cache_salt)
555
- extra_key: Optional[Union[List[str], str]] = None
556
- # Cache salt for request caching
557
- cache_salt: Optional[Union[List[str], str]] = None
558
- # Priority for the request
559
- priority: Optional[int] = None
648
+ if self.response_format and self.response_format.type == "json_schema":
649
+ sampling_params["json_schema"] = convert_json_schema_to_str(
650
+ self.response_format.json_schema.schema_
651
+ )
652
+ elif self.response_format and self.response_format.type == "json_object":
653
+ sampling_params["json_schema"] = '{"type": "object"}'
654
+ elif self.response_format and self.response_format.type == "structural_tag":
655
+ sampling_params["structural_tag"] = convert_json_schema_to_str(
656
+ self.response_format.model_dump(by_alias=True)
657
+ )
560
658
 
561
- # For PD disaggregation
562
- bootstrap_host: Optional[Union[List[str], str]] = None
563
- bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
564
- bootstrap_room: Optional[Union[List[int], int]] = None
659
+ # Check if there are already existing output constraints
660
+ has_existing_constraints = (
661
+ sampling_params.get("regex")
662
+ or sampling_params.get("ebnf")
663
+ or sampling_params.get("structural_tag")
664
+ or sampling_params.get("json_schema")
665
+ )
666
+
667
+ if tool_call_constraint and has_existing_constraints:
668
+ logger.warning("Constrained decoding is not compatible with tool calls.")
669
+ elif tool_call_constraint:
670
+ constraint_type, constraint_value = tool_call_constraint
671
+ if constraint_type == "structural_tag":
672
+ sampling_params[constraint_type] = convert_json_schema_to_str(
673
+ constraint_value.model_dump(by_alias=True)
674
+ )
675
+ elif constraint_type == "json_schema":
676
+ sampling_params[constraint_type] = convert_json_schema_to_str(
677
+ constraint_value # type: ignore
678
+ )
679
+ else:
680
+ sampling_params[constraint_type] = constraint_value
681
+
682
+ return sampling_params
565
683
 
566
684
 
567
685
  class ChatMessage(BaseModel):
@@ -668,6 +786,37 @@ class EmbeddingObject(BaseModel):
668
786
  object: str = "embedding"
669
787
 
670
788
 
789
+ ClassifyInput = Union[str, List[str], List[int]]
790
+
791
+
792
+ class ClassifyRequest(BaseModel):
793
+ # OpenAI-compatible classification request
794
+ model: str = DEFAULT_MODEL_NAME
795
+ input: ClassifyInput
796
+ user: Optional[str] = None
797
+
798
+ # The request id.
799
+ rid: Optional[Union[List[str], str]] = None
800
+ # Priority for the request
801
+ priority: Optional[int] = None
802
+
803
+
804
+ class ClassifyData(BaseModel):
805
+ index: int
806
+ label: str
807
+ probs: List[float]
808
+ num_classes: int
809
+
810
+
811
+ class ClassifyResponse(BaseModel):
812
+ id: str
813
+ object: str = "list"
814
+ created: int
815
+ model: str
816
+ data: List[ClassifyData]
817
+ usage: UsageInfo
818
+
819
+
671
820
  class EmbeddingResponse(BaseModel):
672
821
  data: List[EmbeddingObject]
673
822
  model: str
@@ -711,12 +860,51 @@ class RerankResponse(BaseModel):
711
860
  meta_info: Optional[dict] = None
712
861
 
713
862
 
863
+ class TokenizeRequest(BaseModel):
864
+ """Request schema for the /tokenize endpoint."""
865
+
866
+ model: str = DEFAULT_MODEL_NAME
867
+ prompt: Union[str, List[str]]
868
+ add_special_tokens: bool = Field(
869
+ default=True,
870
+ description="whether to add model-specific special tokens (e.g. BOS/EOS) during encoding.",
871
+ )
872
+
873
+
874
+ class TokenizeResponse(BaseModel):
875
+ """Response schema for the /tokenize endpoint."""
876
+
877
+ tokens: Union[List[int], List[List[int]]]
878
+ count: Union[int, List[int]]
879
+ max_model_len: int
880
+
881
+
882
+ class DetokenizeRequest(BaseModel):
883
+ """Request schema for the /detokenize endpoint."""
884
+
885
+ model: str = DEFAULT_MODEL_NAME
886
+ tokens: Union[List[int], List[List[int]]]
887
+ skip_special_tokens: bool = Field(
888
+ default=True,
889
+ description="whether to exclude special tokens (e.g. padding or EOS) during decoding.",
890
+ )
891
+
892
+
893
+ class DetokenizeResponse(BaseModel):
894
+ """Response schema for the /detokenize endpoint."""
895
+
896
+ text: Union[str, List[str]]
897
+
898
+
714
899
  OpenAIServingRequest = Union[
715
900
  ChatCompletionRequest,
716
901
  CompletionRequest,
717
902
  EmbeddingRequest,
903
+ ClassifyRequest,
718
904
  ScoringRequest,
719
905
  V1RerankReqInput,
906
+ TokenizeRequest,
907
+ DetokenizeRequest,
720
908
  ]
721
909
 
722
910
 
@@ -924,7 +1112,7 @@ class ResponsesResponse(BaseModel):
924
1112
  Union[
925
1113
  ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall
926
1114
  ]
927
- ]
1115
+ ],
928
1116
  ) -> bool:
929
1117
  if not items:
930
1118
  return False
@@ -1014,7 +1202,7 @@ class MessageProcessingResult:
1014
1202
  video_data: Optional[Any]
1015
1203
  modalities: List[str]
1016
1204
  stop: List[str]
1017
- tool_call_constraint: Optional[Any] = None
1205
+ tool_call_constraint: Optional[ToolCallConstraint] = None
1018
1206
 
1019
1207
 
1020
1208
  class ToolCallProcessingResult(NamedTuple):
@@ -4,8 +4,9 @@ import json
4
4
  import logging
5
5
  import uuid
6
6
  from abc import ABC, abstractmethod
7
- from typing import TYPE_CHECKING, Any, Optional, Union
7
+ from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
8
8
 
9
+ import orjson
9
10
  from fastapi import HTTPException, Request
10
11
  from fastapi.responses import ORJSONResponse, StreamingResponse
11
12
 
@@ -34,6 +35,52 @@ class OpenAIServingBase(ABC):
34
35
  else None
35
36
  )
36
37
 
38
+ def _parse_model_parameter(self, model: str) -> Tuple[str, Optional[str]]:
39
+ """Parse 'base-model:adapter-name' syntax to extract LoRA adapter.
40
+
41
+ Returns (base_model, adapter_name) or (model, None) if no colon present.
42
+ """
43
+ if ":" not in model:
44
+ return model, None
45
+
46
+ # Split on first colon only to handle model paths with multiple colons
47
+ parts = model.split(":", 1)
48
+ base_model = parts[0].strip()
49
+ adapter_name = parts[1].strip() or None
50
+
51
+ return base_model, adapter_name
52
+
53
+ def _resolve_lora_path(
54
+ self,
55
+ request_model: str,
56
+ explicit_lora_path: Optional[Union[str, List[Optional[str]]]],
57
+ ) -> Optional[Union[str, List[Optional[str]]]]:
58
+ """Resolve LoRA adapter with priority: model parameter > explicit lora_path.
59
+
60
+ Returns adapter name or None. Supports both single values and lists (batches).
61
+ """
62
+ _, adapter_from_model = self._parse_model_parameter(request_model)
63
+
64
+ # Model parameter adapter takes precedence
65
+ if adapter_from_model is not None:
66
+ return adapter_from_model
67
+
68
+ # Fall back to explicit lora_path
69
+ return explicit_lora_path
70
+
71
+ def _validate_lora_enabled(self, adapter_name: str) -> None:
72
+ """Check that LoRA is enabled before attempting to use an adapter.
73
+
74
+ Raises ValueError with actionable guidance if --enable-lora flag is missing.
75
+ Adapter existence is validated later by TokenizerManager.lora_registry.
76
+ """
77
+ if not self.tokenizer_manager.server_args.enable_lora:
78
+ raise ValueError(
79
+ f"LoRA adapter '{adapter_name}' was requested, but LoRA is not enabled. "
80
+ "Please launch the server with --enable-lora flag and preload adapters "
81
+ "using --lora-paths or /load_lora_adapter endpoint."
82
+ )
83
+
37
84
  async def handle_request(
38
85
  self, request: OpenAIServingRequest, raw_request: Request
39
86
  ) -> Union[Any, StreamingResponse, ErrorResponse]:
@@ -197,7 +244,7 @@ class OpenAIServingBase(ABC):
197
244
  )
198
245
  try:
199
246
  raw_labels = (
200
- json.loads(raw_request.headers.get(header))
247
+ orjson.loads(raw_request.headers.get(header))
201
248
  if raw_request and raw_request.headers.get(header)
202
249
  else None
203
250
  )
@@ -7,6 +7,7 @@ import time
7
7
  import uuid
8
8
  from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
9
9
 
10
+ import orjson
10
11
  from fastapi import Request
11
12
  from fastapi.responses import ORJSONResponse, StreamingResponse
12
13
  from jsonschema import Draft202012Validator, SchemaError
@@ -44,7 +45,6 @@ from sglang.srt.managers.io_struct import GenerateReqInput
44
45
  from sglang.srt.parser.conversation import generate_chat_conv
45
46
  from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
46
47
  from sglang.srt.parser.reasoning_parser import ReasoningParser
47
- from sglang.utils import convert_json_schema_to_str
48
48
 
49
49
  if TYPE_CHECKING:
50
50
  from sglang.srt.managers.template_manager import TemplateManager
@@ -66,6 +66,15 @@ class OpenAIServingChat(OpenAIServingBase):
66
66
  self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
67
67
  self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
68
68
 
69
+ # Get default sampling parameters from model's generation config
70
+ self.default_sampling_params = (
71
+ self.tokenizer_manager.model_config.get_default_sampling_params()
72
+ )
73
+ if self.default_sampling_params:
74
+ logger.info(
75
+ f"Using default chat sampling params from model generation config: {self.default_sampling_params}",
76
+ )
77
+
69
78
  def _request_id_prefix(self) -> str:
70
79
  return "chatcmpl-"
71
80
 
@@ -137,10 +146,10 @@ class OpenAIServingChat(OpenAIServingBase):
137
146
  processed_messages = self._process_messages(request, is_multimodal)
138
147
 
139
148
  # Build sampling parameters
140
- sampling_params = self._build_sampling_params(
141
- request,
142
- processed_messages.stop,
143
- processed_messages.tool_call_constraint,
149
+ sampling_params = request.to_sampling_params(
150
+ stop=processed_messages.stop,
151
+ model_generation_config=self.default_sampling_params,
152
+ tool_call_constraint=processed_messages.tool_call_constraint,
144
153
  )
145
154
 
146
155
  # Handle single vs multiple requests
@@ -155,6 +164,17 @@ class OpenAIServingChat(OpenAIServingBase):
155
164
  # Extract custom labels from raw request headers
156
165
  custom_labels = self.extract_custom_labels(raw_request)
157
166
 
167
+ # Resolve LoRA adapter from model parameter or explicit lora_path
168
+ lora_path = self._resolve_lora_path(request.model, request.lora_path)
169
+ if lora_path:
170
+ first_adapter = (
171
+ lora_path
172
+ if isinstance(lora_path, str)
173
+ else next((a for a in lora_path if a), None)
174
+ )
175
+ if first_adapter:
176
+ self._validate_lora_enabled(first_adapter)
177
+
158
178
  adapted_request = GenerateReqInput(
159
179
  **prompt_kwargs,
160
180
  image_data=processed_messages.image_data,
@@ -167,7 +187,7 @@ class OpenAIServingChat(OpenAIServingBase):
167
187
  stream=request.stream,
168
188
  return_text_in_logprobs=True,
169
189
  modalities=processed_messages.modalities,
170
- lora_path=request.lora_path,
190
+ lora_path=lora_path,
171
191
  bootstrap_host=request.bootstrap_host,
172
192
  bootstrap_port=request.bootstrap_port,
173
193
  bootstrap_room=request.bootstrap_room,
@@ -176,6 +196,7 @@ class OpenAIServingChat(OpenAIServingBase):
176
196
  extra_key=self._compute_extra_key(request),
177
197
  priority=request.priority,
178
198
  custom_labels=custom_labels,
199
+ custom_logit_processor=request.custom_logit_processor,
179
200
  )
180
201
 
181
202
  return adapted_request, request
@@ -277,7 +298,7 @@ class OpenAIServingChat(OpenAIServingBase):
277
298
  if "arguments" in item["function"] and isinstance(
278
299
  item["function"]["arguments"], str
279
300
  ):
280
- item["function"]["arguments"] = json.loads(
301
+ item["function"]["arguments"] = orjson.loads(
281
302
  item["function"]["arguments"]
282
303
  )
283
304
 
@@ -410,72 +431,6 @@ class OpenAIServingChat(OpenAIServingBase):
410
431
  stop=stop,
411
432
  )
412
433
 
413
- def _build_sampling_params(
414
- self,
415
- request: ChatCompletionRequest,
416
- stop: List[str],
417
- tool_call_constraint: Optional[Any],
418
- ) -> Dict[str, Any]:
419
- """Build sampling parameters for the request"""
420
-
421
- sampling_params = {
422
- "temperature": request.temperature,
423
- "max_new_tokens": request.max_tokens or request.max_completion_tokens,
424
- "min_new_tokens": request.min_tokens,
425
- "stop": stop,
426
- "stop_token_ids": request.stop_token_ids,
427
- "top_p": request.top_p,
428
- "top_k": request.top_k,
429
- "min_p": request.min_p,
430
- "presence_penalty": request.presence_penalty,
431
- "frequency_penalty": request.frequency_penalty,
432
- "repetition_penalty": request.repetition_penalty,
433
- "regex": request.regex,
434
- "ebnf": request.ebnf,
435
- "n": request.n,
436
- "no_stop_trim": request.no_stop_trim,
437
- "ignore_eos": request.ignore_eos,
438
- "skip_special_tokens": request.skip_special_tokens,
439
- "logit_bias": request.logit_bias,
440
- }
441
-
442
- if request.response_format and request.response_format.type == "json_schema":
443
- sampling_params["json_schema"] = convert_json_schema_to_str(
444
- request.response_format.json_schema.schema_
445
- )
446
- elif request.response_format and request.response_format.type == "json_object":
447
- sampling_params["json_schema"] = '{"type": "object"}'
448
- elif (
449
- request.response_format and request.response_format.type == "structural_tag"
450
- ):
451
- sampling_params["structural_tag"] = convert_json_schema_to_str(
452
- request.response_format.model_dump(by_alias=True)
453
- )
454
-
455
- # Check if there are already existing output constraints
456
- has_existing_constraints = (
457
- sampling_params.get("regex")
458
- or sampling_params.get("ebnf")
459
- or sampling_params.get("structural_tag")
460
- or sampling_params.get("json_schema")
461
- )
462
-
463
- if tool_call_constraint and has_existing_constraints:
464
- logger.warning("Constrained decoding is not compatible with tool calls.")
465
- elif tool_call_constraint:
466
- constraint_type, constraint_value = tool_call_constraint
467
- if constraint_type == "structural_tag":
468
- sampling_params[constraint_type] = convert_json_schema_to_str(
469
- constraint_value.model_dump(by_alias=True)
470
- )
471
- elif constraint_type == "json_schema":
472
- sampling_params[constraint_type] = convert_json_schema_to_str(
473
- constraint_value
474
- )
475
- else:
476
- sampling_params[constraint_type] = constraint_value
477
- return sampling_params
478
-
479
434
  async def _handle_streaming_request(
480
435
  self,
481
436
  adapted_request: GenerateReqInput,
@@ -918,7 +873,7 @@ class OpenAIServingChat(OpenAIServingBase):
918
873
  finish_reason["matched"] = None
919
874
  try:
920
875
  # For required tool choice, we expect a JSON array of tool calls
921
- tool_call_data = json.loads(text)
876
+ tool_call_data = orjson.loads(text)
922
877
  tool_calls = []
923
878
  for i, tool in enumerate(tool_call_data):
924
879
  # Create a ToolCallItem from the JSON data