sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (408)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +330 -156
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +8 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +134 -23
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +70 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +66 -66
  69. sglang/srt/entrypoints/grpc_server.py +431 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +120 -8
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +42 -4
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +3 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +18 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/utils.py +2 -2
  93. sglang/srt/grpc/compile_proto.py +3 -3
  94. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  95. sglang/srt/grpc/health_servicer.py +189 -0
  96. sglang/srt/grpc/scheduler_launcher.py +181 -0
  97. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  98. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  99. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  100. sglang/srt/layers/activation.py +4 -1
  101. sglang/srt/layers/attention/aiter_backend.py +3 -3
  102. sglang/srt/layers/attention/ascend_backend.py +17 -1
  103. sglang/srt/layers/attention/attention_registry.py +43 -23
  104. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  105. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  106. sglang/srt/layers/attention/fla/chunk.py +0 -1
  107. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  108. sglang/srt/layers/attention/fla/index.py +0 -2
  109. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  110. sglang/srt/layers/attention/fla/utils.py +0 -3
  111. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  112. sglang/srt/layers/attention/flashattention_backend.py +12 -8
  113. sglang/srt/layers/attention/flashinfer_backend.py +248 -21
  114. sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
  115. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  116. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  117. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  118. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  119. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  121. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  122. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  123. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  124. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  125. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  127. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  128. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  129. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  130. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  131. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  132. sglang/srt/layers/attention/nsa/utils.py +0 -1
  133. sglang/srt/layers/attention/nsa_backend.py +404 -90
  134. sglang/srt/layers/attention/triton_backend.py +208 -34
  135. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  136. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  137. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  138. sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
  139. sglang/srt/layers/attention/utils.py +11 -7
  140. sglang/srt/layers/attention/vision.py +3 -3
  141. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  142. sglang/srt/layers/communicator.py +11 -7
  143. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  146. sglang/srt/layers/dp_attention.py +17 -0
  147. sglang/srt/layers/layernorm.py +45 -15
  148. sglang/srt/layers/linear.py +9 -1
  149. sglang/srt/layers/logits_processor.py +147 -17
  150. sglang/srt/layers/modelopt_utils.py +11 -0
  151. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  152. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  153. sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
  154. sglang/srt/layers/moe/ep_moe/layer.py +119 -397
  155. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  159. sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
  160. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  161. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  162. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  163. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  164. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  165. sglang/srt/layers/moe/router.py +51 -15
  166. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  167. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  168. sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
  169. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  170. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  171. sglang/srt/layers/moe/topk.py +3 -2
  172. sglang/srt/layers/moe/utils.py +17 -1
  173. sglang/srt/layers/quantization/__init__.py +2 -53
  174. sglang/srt/layers/quantization/awq.py +183 -6
  175. sglang/srt/layers/quantization/awq_triton.py +29 -0
  176. sglang/srt/layers/quantization/base_config.py +20 -1
  177. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  178. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  179. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  180. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  181. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  183. sglang/srt/layers/quantization/fp8.py +84 -18
  184. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  185. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  186. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  187. sglang/srt/layers/quantization/gptq.py +0 -1
  188. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  189. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  190. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  191. sglang/srt/layers/quantization/mxfp4.py +5 -30
  192. sglang/srt/layers/quantization/petit.py +1 -1
  193. sglang/srt/layers/quantization/quark/quark.py +3 -1
  194. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  195. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  196. sglang/srt/layers/quantization/unquant.py +1 -4
  197. sglang/srt/layers/quantization/utils.py +0 -1
  198. sglang/srt/layers/quantization/w4afp8.py +51 -20
  199. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  200. sglang/srt/layers/radix_attention.py +59 -9
  201. sglang/srt/layers/rotary_embedding.py +673 -16
  202. sglang/srt/layers/sampler.py +36 -16
  203. sglang/srt/layers/sparse_pooler.py +98 -0
  204. sglang/srt/layers/utils.py +0 -1
  205. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  206. sglang/srt/lora/backend/triton_backend.py +0 -1
  207. sglang/srt/lora/eviction_policy.py +139 -0
  208. sglang/srt/lora/lora_manager.py +24 -9
  209. sglang/srt/lora/lora_registry.py +1 -1
  210. sglang/srt/lora/mem_pool.py +40 -16
  211. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  212. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  213. sglang/srt/managers/cache_controller.py +48 -17
  214. sglang/srt/managers/data_parallel_controller.py +146 -42
  215. sglang/srt/managers/detokenizer_manager.py +40 -13
  216. sglang/srt/managers/io_struct.py +66 -16
  217. sglang/srt/managers/mm_utils.py +20 -18
  218. sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
  219. sglang/srt/managers/overlap_utils.py +96 -19
  220. sglang/srt/managers/schedule_batch.py +241 -511
  221. sglang/srt/managers/schedule_policy.py +15 -2
  222. sglang/srt/managers/scheduler.py +399 -499
  223. sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
  224. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  225. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  226. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  227. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  228. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  229. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  230. sglang/srt/managers/tokenizer_manager.py +378 -90
  231. sglang/srt/managers/tp_worker.py +212 -161
  232. sglang/srt/managers/utils.py +78 -2
  233. sglang/srt/mem_cache/allocator.py +7 -2
  234. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  235. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  236. sglang/srt/mem_cache/chunk_cache.py +13 -2
  237. sglang/srt/mem_cache/common.py +480 -0
  238. sglang/srt/mem_cache/evict_policy.py +16 -1
  239. sglang/srt/mem_cache/hicache_storage.py +4 -1
  240. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  241. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  242. sglang/srt/mem_cache/memory_pool.py +435 -219
  243. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  244. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  245. sglang/srt/mem_cache/radix_cache.py +53 -19
  246. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  247. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  249. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  250. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  251. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  252. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  253. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  254. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  255. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  256. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  257. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  258. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  259. sglang/srt/metrics/collector.py +31 -0
  260. sglang/srt/metrics/func_timer.py +1 -1
  261. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  262. sglang/srt/model_executor/forward_batch_info.py +28 -23
  263. sglang/srt/model_executor/model_runner.py +379 -139
  264. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  265. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  266. sglang/srt/model_loader/__init__.py +1 -1
  267. sglang/srt/model_loader/loader.py +424 -27
  268. sglang/srt/model_loader/utils.py +0 -1
  269. sglang/srt/model_loader/weight_utils.py +47 -28
  270. sglang/srt/models/apertus.py +2 -3
  271. sglang/srt/models/arcee.py +2 -2
  272. sglang/srt/models/bailing_moe.py +13 -52
  273. sglang/srt/models/bailing_moe_nextn.py +3 -4
  274. sglang/srt/models/bert.py +1 -1
  275. sglang/srt/models/deepseek_nextn.py +19 -3
  276. sglang/srt/models/deepseek_ocr.py +1516 -0
  277. sglang/srt/models/deepseek_v2.py +273 -98
  278. sglang/srt/models/dots_ocr.py +0 -2
  279. sglang/srt/models/dots_vlm.py +0 -1
  280. sglang/srt/models/dots_vlm_vit.py +1 -1
  281. sglang/srt/models/falcon_h1.py +13 -19
  282. sglang/srt/models/gemma3_mm.py +16 -0
  283. sglang/srt/models/gemma3n_mm.py +1 -2
  284. sglang/srt/models/glm4_moe.py +14 -37
  285. sglang/srt/models/glm4_moe_nextn.py +2 -2
  286. sglang/srt/models/glm4v.py +2 -1
  287. sglang/srt/models/glm4v_moe.py +5 -5
  288. sglang/srt/models/gpt_oss.py +5 -5
  289. sglang/srt/models/grok.py +10 -23
  290. sglang/srt/models/hunyuan.py +2 -7
  291. sglang/srt/models/interns1.py +0 -1
  292. sglang/srt/models/kimi_vl.py +1 -7
  293. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  294. sglang/srt/models/llama.py +2 -2
  295. sglang/srt/models/llama_eagle3.py +1 -1
  296. sglang/srt/models/longcat_flash.py +5 -22
  297. sglang/srt/models/longcat_flash_nextn.py +3 -14
  298. sglang/srt/models/mimo.py +2 -13
  299. sglang/srt/models/mimo_mtp.py +1 -2
  300. sglang/srt/models/minicpmo.py +7 -5
  301. sglang/srt/models/mixtral.py +1 -4
  302. sglang/srt/models/mllama.py +1 -1
  303. sglang/srt/models/mllama4.py +13 -3
  304. sglang/srt/models/nemotron_h.py +511 -0
  305. sglang/srt/models/olmo2.py +31 -4
  306. sglang/srt/models/opt.py +5 -5
  307. sglang/srt/models/phi.py +1 -1
  308. sglang/srt/models/phi4mm.py +1 -1
  309. sglang/srt/models/phimoe.py +0 -1
  310. sglang/srt/models/pixtral.py +0 -3
  311. sglang/srt/models/points_v15_chat.py +186 -0
  312. sglang/srt/models/qwen.py +0 -1
  313. sglang/srt/models/qwen2_5_vl.py +3 -3
  314. sglang/srt/models/qwen2_audio.py +2 -15
  315. sglang/srt/models/qwen2_moe.py +15 -12
  316. sglang/srt/models/qwen2_vl.py +5 -2
  317. sglang/srt/models/qwen3_moe.py +19 -35
  318. sglang/srt/models/qwen3_next.py +7 -12
  319. sglang/srt/models/qwen3_next_mtp.py +3 -4
  320. sglang/srt/models/qwen3_omni_moe.py +661 -0
  321. sglang/srt/models/qwen3_vl.py +37 -33
  322. sglang/srt/models/qwen3_vl_moe.py +57 -185
  323. sglang/srt/models/roberta.py +55 -3
  324. sglang/srt/models/sarashina2_vision.py +0 -1
  325. sglang/srt/models/step3_vl.py +3 -5
  326. sglang/srt/models/utils.py +11 -1
  327. sglang/srt/multimodal/processors/base_processor.py +6 -2
  328. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  329. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  330. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  331. sglang/srt/multimodal/processors/glm4v.py +1 -5
  332. sglang/srt/multimodal/processors/internvl.py +0 -2
  333. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  334. sglang/srt/multimodal/processors/mllama4.py +0 -8
  335. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  336. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  337. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  338. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  339. sglang/srt/parser/conversation.py +41 -0
  340. sglang/srt/parser/reasoning_parser.py +0 -1
  341. sglang/srt/sampling/custom_logit_processor.py +77 -2
  342. sglang/srt/sampling/sampling_batch_info.py +17 -22
  343. sglang/srt/sampling/sampling_params.py +70 -2
  344. sglang/srt/server_args.py +577 -73
  345. sglang/srt/server_args_config_parser.py +1 -1
  346. sglang/srt/single_batch_overlap.py +38 -28
  347. sglang/srt/speculative/base_spec_worker.py +34 -0
  348. sglang/srt/speculative/draft_utils.py +226 -0
  349. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  350. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  351. sglang/srt/speculative/eagle_info.py +57 -18
  352. sglang/srt/speculative/eagle_info_v2.py +458 -0
  353. sglang/srt/speculative/eagle_utils.py +138 -0
  354. sglang/srt/speculative/eagle_worker.py +83 -280
  355. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  356. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  357. sglang/srt/speculative/ngram_worker.py +12 -11
  358. sglang/srt/speculative/spec_info.py +2 -0
  359. sglang/srt/speculative/spec_utils.py +38 -3
  360. sglang/srt/speculative/standalone_worker.py +4 -14
  361. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  362. sglang/srt/two_batch_overlap.py +28 -14
  363. sglang/srt/utils/__init__.py +1 -1
  364. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  365. sglang/srt/utils/common.py +192 -47
  366. sglang/srt/utils/hf_transformers_utils.py +40 -17
  367. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  368. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  369. sglang/srt/utils/profile_merger.py +199 -0
  370. sglang/test/attention/test_flashattn_backend.py +1 -1
  371. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  372. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  373. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  374. sglang/test/few_shot_gsm8k_engine.py +2 -4
  375. sglang/test/kit_matched_stop.py +157 -0
  376. sglang/test/longbench_v2/__init__.py +1 -0
  377. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  378. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  379. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  380. sglang/test/run_eval.py +41 -0
  381. sglang/test/runners.py +2 -0
  382. sglang/test/send_one.py +42 -7
  383. sglang/test/simple_eval_common.py +3 -0
  384. sglang/test/simple_eval_gpqa.py +0 -1
  385. sglang/test/simple_eval_humaneval.py +0 -3
  386. sglang/test/simple_eval_longbench_v2.py +344 -0
  387. sglang/test/test_block_fp8.py +1 -2
  388. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  389. sglang/test/test_cutlass_moe.py +1 -2
  390. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  391. sglang/test/test_deterministic.py +232 -99
  392. sglang/test/test_deterministic_utils.py +73 -0
  393. sglang/test/test_disaggregation_utils.py +81 -0
  394. sglang/test/test_marlin_moe.py +0 -1
  395. sglang/test/test_utils.py +85 -20
  396. sglang/version.py +1 -1
  397. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
  398. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
  399. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  400. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  401. sglang/srt/speculative/build_eagle_tree.py +0 -427
  402. sglang/test/test_block_fp8_ep.py +0 -358
  403. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  404. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  405. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  406. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
--- a/sglang/srt/entrypoints/openai/serving_base.py
+++ b/sglang/srt/entrypoints/openai/serving_base.py
@@ -4,8 +4,9 @@ import json
 import logging
 import uuid
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
 
+import orjson
 from fastapi import HTTPException, Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
 
@@ -34,6 +35,52 @@ class OpenAIServingBase(ABC):
             else None
         )
 
+    def _parse_model_parameter(self, model: str) -> Tuple[str, Optional[str]]:
+        """Parse 'base-model:adapter-name' syntax to extract LoRA adapter.
+
+        Returns (base_model, adapter_name) or (model, None) if no colon present.
+        """
+        if ":" not in model:
+            return model, None
+
+        # Split on first colon only to handle model paths with multiple colons
+        parts = model.split(":", 1)
+        base_model = parts[0].strip()
+        adapter_name = parts[1].strip() or None
+
+        return base_model, adapter_name
+
+    def _resolve_lora_path(
+        self,
+        request_model: str,
+        explicit_lora_path: Optional[Union[str, List[Optional[str]]]],
+    ) -> Optional[Union[str, List[Optional[str]]]]:
+        """Resolve LoRA adapter with priority: model parameter > explicit lora_path.
+
+        Returns adapter name or None. Supports both single values and lists (batches).
+        """
+        _, adapter_from_model = self._parse_model_parameter(request_model)
+
+        # Model parameter adapter takes precedence
+        if adapter_from_model is not None:
+            return adapter_from_model
+
+        # Fall back to explicit lora_path
+        return explicit_lora_path
+
+    def _validate_lora_enabled(self, adapter_name: str) -> None:
+        """Check that LoRA is enabled before attempting to use an adapter.
+
+        Raises ValueError with actionable guidance if --enable-lora flag is missing.
+        Adapter existence is validated later by TokenizerManager.lora_registry.
+        """
+        if not self.tokenizer_manager.server_args.enable_lora:
+            raise ValueError(
+                f"LoRA adapter '{adapter_name}' was requested, but LoRA is not enabled. "
+                "Please launch the server with --enable-lora flag and preload adapters "
+                "using --lora-paths or /load_lora_adapter endpoint."
+            )
+
     async def handle_request(
         self, request: OpenAIServingRequest, raw_request: Request
     ) -> Union[Any, StreamingResponse, ErrorResponse]:
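With these helpers, a LoRA adapter can be selected straight from the OpenAI-style `model` field. A minimal client-side sketch, assuming a local server launched with `--enable-lora` and a preloaded adapter named `my-adapter` (the model and adapter names here are illustrative, not from the diff):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    # "base-model:adapter-name" is split on the first colon only, so model
    # paths that themselves contain ':' still parse; an adapter named here
    # takes precedence over an explicit lora_path in the request body.
    model="meta-llama/Llama-3.1-8B-Instruct:my-adapter",
    messages=[{"role": "user", "content": "Hello"}],
)
print(resp.choices[0].message.content)
```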
@@ -197,7 +244,7 @@ class OpenAIServingBase(ABC):
         )
         try:
             raw_labels = (
-                json.loads(raw_request.headers.get(header))
+                orjson.loads(raw_request.headers.get(header))
                 if raw_request and raw_request.headers.get(header)
                 else None
             )
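This release swaps stdlib `json.loads` for `orjson.loads` at several call sites (here and in serving_chat.py and serving_responses.py below). The two are drop-in compatible for parsing; the main behavioral difference sits on the write side, where `orjson.dumps` returns compact `bytes` rather than `str`:

```python
import json

import orjson

payload = '{"labels": {"team": "serving"}}'

# Parsing is interchangeable: both accept str (orjson also accepts bytes).
assert orjson.loads(payload) == json.loads(payload)

# Serialization differs: orjson emits compact bytes, stdlib emits str.
assert orjson.dumps({"a": 1}) == b'{"a":1}'
assert json.dumps({"a": 1}) == '{"a": 1}'
```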
--- a/sglang/srt/entrypoints/openai/serving_chat.py
+++ b/sglang/srt/entrypoints/openai/serving_chat.py
@@ -7,6 +7,7 @@ import time
 import uuid
 from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
 
+import orjson
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
 from jsonschema import Draft202012Validator, SchemaError
@@ -44,7 +45,6 @@ from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.parser.conversation import generate_chat_conv
 from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
 from sglang.srt.parser.reasoning_parser import ReasoningParser
-from sglang.utils import convert_json_schema_to_str
 
 if TYPE_CHECKING:
     from sglang.srt.managers.template_manager import TemplateManager
@@ -66,6 +66,15 @@ class OpenAIServingChat(OpenAIServingBase):
         self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
         self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
 
+        # Get default sampling parameters from model's generation config
+        self.default_sampling_params = (
+            self.tokenizer_manager.model_config.get_default_sampling_params()
+        )
+        if self.default_sampling_params:
+            logger.info(
+                f"Using default chat sampling params from model generation config: {self.default_sampling_params}",
+            )
+
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
 
@@ -137,10 +146,10 @@ class OpenAIServingChat(OpenAIServingBase):
         processed_messages = self._process_messages(request, is_multimodal)
 
         # Build sampling parameters
-        sampling_params = self._build_sampling_params(
-            request,
-            processed_messages.stop,
-            processed_messages.tool_call_constraint,
+        sampling_params = request.to_sampling_params(
+            stop=processed_messages.stop,
+            model_generation_config=self.default_sampling_params,
+            tool_call_constraint=processed_messages.tool_call_constraint,
         )
 
         # Handle single vs multiple requests
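Taken together, the two chat changes above let sampling defaults flow from the model's shipped generation config into `to_sampling_params`. A conceptual sketch of the merge, assuming the defaults look like typical Hugging Face `generation_config.json` fields (the exact mapping lives inside `ModelConfig.get_default_sampling_params` and `to_sampling_params`, not shown in this diff):

```python
# Hypothetical defaults as they might come out of generation_config.json:
model_generation_config = {"temperature": 0.6, "top_p": 0.95}

# Conceptually, values the request leaves unset fall back to these defaults,
# while explicitly set request fields win:
request_overrides = {"temperature": None, "top_p": 0.8}

merged = {}
for key, default in model_generation_config.items():
    value = request_overrides.get(key)
    merged[key] = value if value is not None else default

assert merged == {"temperature": 0.6, "top_p": 0.8}
```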
@@ -155,6 +164,17 @@ class OpenAIServingChat(OpenAIServingBase):
         # Extract custom labels from raw request headers
         custom_labels = self.extract_custom_labels(raw_request)
 
+        # Resolve LoRA adapter from model parameter or explicit lora_path
+        lora_path = self._resolve_lora_path(request.model, request.lora_path)
+        if lora_path:
+            first_adapter = (
+                lora_path
+                if isinstance(lora_path, str)
+                else next((a for a in lora_path if a), None)
+            )
+            if first_adapter:
+                self._validate_lora_enabled(first_adapter)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             image_data=processed_messages.image_data,
@@ -167,7 +187,7 @@
             stream=request.stream,
             return_text_in_logprobs=True,
             modalities=processed_messages.modalities,
-            lora_path=request.lora_path,
+            lora_path=lora_path,
             bootstrap_host=request.bootstrap_host,
             bootstrap_port=request.bootstrap_port,
             bootstrap_room=request.bootstrap_room,
@@ -176,6 +196,7 @@
             extra_key=self._compute_extra_key(request),
             priority=request.priority,
             custom_labels=custom_labels,
+            custom_logit_processor=request.custom_logit_processor,
         )
 
         return adapted_request, request
@@ -277,7 +298,7 @@
                 if "arguments" in item["function"] and isinstance(
                     item["function"]["arguments"], str
                 ):
-                    item["function"]["arguments"] = json.loads(
+                    item["function"]["arguments"] = orjson.loads(
                         item["function"]["arguments"]
                     )
 
@@ -410,72 +431,6 @@ class OpenAIServingChat(OpenAIServingBase):
             stop=stop,
         )
 
-    def _build_sampling_params(
-        self,
-        request: ChatCompletionRequest,
-        stop: List[str],
-        tool_call_constraint: Optional[Any],
-    ) -> Dict[str, Any]:
-        """Build sampling parameters for the request"""
-
-        sampling_params = {
-            "temperature": request.temperature,
-            "max_new_tokens": request.max_tokens or request.max_completion_tokens,
-            "min_new_tokens": request.min_tokens,
-            "stop": stop,
-            "stop_token_ids": request.stop_token_ids,
-            "top_p": request.top_p,
-            "top_k": request.top_k,
-            "min_p": request.min_p,
-            "presence_penalty": request.presence_penalty,
-            "frequency_penalty": request.frequency_penalty,
-            "repetition_penalty": request.repetition_penalty,
-            "regex": request.regex,
-            "ebnf": request.ebnf,
-            "n": request.n,
-            "no_stop_trim": request.no_stop_trim,
-            "ignore_eos": request.ignore_eos,
-            "skip_special_tokens": request.skip_special_tokens,
-            "logit_bias": request.logit_bias,
-        }
-
-        if request.response_format and request.response_format.type == "json_schema":
-            sampling_params["json_schema"] = convert_json_schema_to_str(
-                request.response_format.json_schema.schema_
-            )
-        elif request.response_format and request.response_format.type == "json_object":
-            sampling_params["json_schema"] = '{"type": "object"}'
-        elif (
-            request.response_format and request.response_format.type == "structural_tag"
-        ):
-            sampling_params["structural_tag"] = convert_json_schema_to_str(
-                request.response_format.model_dump(by_alias=True)
-            )
-
-        # Check if there are already existing output constraints
-        has_existing_constraints = (
-            sampling_params.get("regex")
-            or sampling_params.get("ebnf")
-            or sampling_params.get("structural_tag")
-            or sampling_params.get("json_schema")
-        )
-
-        if tool_call_constraint and has_existing_constraints:
-            logger.warning("Constrained decoding is not compatible with tool calls.")
-        elif tool_call_constraint:
-            constraint_type, constraint_value = tool_call_constraint
-            if constraint_type == "structural_tag":
-                sampling_params[constraint_type] = convert_json_schema_to_str(
-                    constraint_value.model_dump(by_alias=True)
-                )
-            elif constraint_type == "json_schema":
-                sampling_params[constraint_type] = convert_json_schema_to_str(
-                    constraint_value
-                )
-            else:
-                sampling_params[constraint_type] = constraint_value
-        return sampling_params
-
     async def _handle_streaming_request(
         self,
         adapted_request: GenerateReqInput,
@@ -918,7 +873,7 @@
                     finish_reason["matched"] = None
                 try:
                     # For required tool choice, we expect a JSON array of tool calls
-                    tool_call_data = json.loads(text)
+                    tool_call_data = orjson.loads(text)
                     tool_calls = []
                     for i, tool in enumerate(tool_call_data):
                         # Create a ToolCallItem from the JSON data
--- /dev/null
+++ b/sglang/srt/entrypoints/openai/serving_classify.py
@@ -0,0 +1,204 @@
+from __future__ import annotations
+
+import logging
+import time
+import uuid
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from fastapi import Request
+from fastapi.responses import ORJSONResponse
+
+from sglang.srt.entrypoints.openai.protocol import (
+    ClassifyRequest,
+    ClassifyResponse,
+    ErrorResponse,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.managers.io_struct import EmbeddingReqInput
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingClassify(OpenAIServingBase):
+    """Handler for v1/classify requests"""
+
+    def __init__(
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
+    ):
+        super().__init__(tokenizer_manager)
+        self.template_manager = template_manager
+        self.id2label = self._get_id2label_mapping()
+        self.model_name = (
+            self.tokenizer_manager.served_model_name
+            if self.tokenizer_manager.served_model_name
+            else self.tokenizer_manager.server_args.model_path
+        )
+        if not self.id2label:
+            raise ValueError("id2label mapping is missing")
+
+    def _request_id_prefix(self) -> str:
+        return "classify-"
+
+    def _convert_to_internal_request(
+        self,
+        request: ClassifyRequest,
+        raw_request: Request = None,
+    ) -> tuple[EmbeddingReqInput, ClassifyRequest]:
+        """Convert OpenAI embedding request to internal format"""
+        prompt = request.input
+
+        if isinstance(prompt, str):
+            # Single string input
+            prompt_kwargs = {"text": prompt}
+        elif isinstance(prompt, list):
+            if len(prompt) > 0 and isinstance(prompt[0], str):
+                prompt_kwargs = {"text": prompt}
+            else:
+                # List of integers (token IDs) or empty list
+                prompt_kwargs = {"input_ids": prompt}
+        else:
+            # Other types (should not happen but handle gracefully)
+            prompt_kwargs = {"input_ids": prompt}
+
+        adapted_request = EmbeddingReqInput(
+            **prompt_kwargs,
+            rid=request.rid,
+            priority=request.priority,
+        )
+
+        return adapted_request, request
+
+    def _validate_request(self, request: ClassifyRequest) -> Optional[str]:
+        """Validate that the input is not empty or whitespace only."""
+        if not (input := request.input):
+            return "Input cannot be empty"
+
+        # Handle single string
+        if isinstance(input, str):
+            if not input.strip():
+                return "Input cannot be empty or whitespace only"
+            return None
+
+        # Handle list inputs
+        if isinstance(input, list):
+            # Check first element to determine type
+            first_item = input[0]
+
+            if isinstance(first_item, str):
+                # List of strings
+                for i, item in enumerate(input):
+                    if not isinstance(item, str):
+                        return f"All items in input list must be strings"
+                    if not item.strip():
+                        return f"Input at index {i} cannot be empty or whitespace only"
+            elif isinstance(first_item, int):
+                # List of integers (token IDs)
+                for i, item in enumerate(input):
+                    if not isinstance(item, int):
+                        return f"All items in input list must be integers"
+                    if item < 0:
+                        return f"Token ID at index {i} must be non-negative"
+        return None
+
+    def _get_id2label_mapping(self) -> Optional[Dict[int, str]]:
+        """Get id2label mapping from model config."""
+        try:
+            hf_config = self.tokenizer_manager.model_config.hf_config
+            # Check for id2label in hf_config
+            if hf_config.id2label:
+                return hf_config.id2label
+            # Check for num_labels and create default mapping if needed
+            if hasattr(hf_config, "num_labels") and hf_config.num_labels:
+                num_labels = hf_config.num_labels
+                # Create default mapping: {0: "LABEL_0", 1: "LABEL_1", ...}
+                return {i: f"LABEL_{i}" for i in range(num_labels)}
+
+        except Exception as e:
+            logger.warning(f"Failed to get id2label mapping: {e}")
+
+        return None
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: EmbeddingReqInput,
+        request: ClassifyRequest,
+        raw_request: Request,
+    ) -> Union[ClassifyResponse, ErrorResponse, ORJSONResponse]:
+        """Handle non-streaming classification request."""
+        # Generate request ID
+
+        try:
+            ret = await self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ).__anext__()
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        if not isinstance(ret, list):
+            ret = [ret]
+
+        response = self._build_classify_response(ret)
+        return response
+
+    def _build_classify_response(self, ret: List[Dict[str, Any]]) -> ClassifyResponse:
+        request_id = f"{self._request_id_prefix()}{uuid.uuid4().hex}"
+        created_time = int(time.time())
+        classify_objects = []
+        prompt_tokens = 0
+        total_latency = 0.0
+
+        for i, item in enumerate(ret):
+            embedding = item.get("embedding", [])
+            meta_info = item.get("meta_info", {})
+
+            prompt_tokens += meta_info.get("prompt_tokens", 0)
+            total_latency += meta_info.get("e2e_latency", 0.0)
+
+            if embedding:
+                try:
+                    embedding_tensor = torch.tensor(embedding, dtype=torch.float32)
+                    probs = F.softmax(embedding_tensor, dim=0).tolist()
+
+                    predicted_class = torch.argmax(embedding_tensor).item()
+
+                    label = self.id2label[predicted_class]
+
+                except Exception as e:
+                    logger.error(f"Error processing embedding for item {i}: {e}")
+                    probs = [1.0]
+                    label = "Default"
+            else:
+                probs = [1.0]
+                label = "Default"
+
+            classify_obj = {
+                "index": i,
+                "label": label,
+                "probs": probs,
+                "num_classes": len(probs),
+            }
+            classify_objects.append(classify_obj)
+
+        response = {
+            "id": request_id,
+            "object": "list",
+            "created": created_time,
+            "model": self.model_name,
+            "data": classify_objects,
+            "usage": {
+                "prompt_tokens": prompt_tokens,
+                "total_tokens": prompt_tokens,
+                "completion_tokens": 0,
+                "prompt_tokens_details": None,
+            },
+        }
+
+        return ClassifyResponse(**response)
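The new handler reuses the embedding pipeline: the pooled model output is treated as class logits, softmaxed into `probs`, and the argmax is mapped through the config's `id2label`. A client-side sketch against the new route (the URL and model name are illustrative; `input` follows `ClassifyRequest` above, and `model` is assumed to be accepted as on the other OpenAI-style routes):

```python
import requests

resp = requests.post(
    "http://localhost:30000/v1/classify",
    json={"model": "my-classifier", "input": "This movie was great!"},
)
resp.raise_for_status()
result = resp.json()

# Shape mirrors _build_classify_response(): one entry per input, each with
# "index", "label", "probs", and "num_classes".
top = result["data"][0]
print(top["label"], max(top["probs"]))
```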
--- a/sglang/srt/entrypoints/openai/serving_completions.py
+++ b/sglang/srt/entrypoints/openai/serving_completions.py
@@ -93,6 +93,17 @@ class OpenAIServingCompletion(OpenAIServingBase):
         # Extract custom labels from raw request headers
         custom_labels = self.extract_custom_labels(raw_request)
 
+        # Resolve LoRA adapter from model parameter or explicit lora_path
+        lora_path = self._resolve_lora_path(request.model, request.lora_path)
+        if lora_path:
+            first_adapter = (
+                lora_path
+                if isinstance(lora_path, str)
+                else next((a for a in lora_path if a), None)
+            )
+            if first_adapter:
+                self._validate_lora_enabled(first_adapter)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             sampling_params=sampling_params,
@@ -101,7 +112,7 @@
             logprob_start_len=logprob_start_len,
             return_text_in_logprobs=True,
             stream=request.stream,
-            lora_path=request.lora_path,
+            lora_path=lora_path,
             bootstrap_host=request.bootstrap_host,
             bootstrap_port=request.bootstrap_port,
             bootstrap_room=request.bootstrap_room,
@@ -110,6 +121,7 @@
             extra_key=self._compute_extra_key(request),
             priority=request.priority,
             custom_labels=custom_labels,
+            custom_logit_processor=request.custom_logit_processor,
         )
 
         return adapted_request, request
@@ -123,6 +135,7 @@
             "min_new_tokens": request.min_tokens,
             "stop": request.stop,
             "stop_token_ids": request.stop_token_ids,
+            "stop_regex": request.stop_regex,
             "top_p": request.top_p,
             "top_k": request.top_k,
             "min_p": request.min_p,
@@ -137,6 +150,7 @@
             "ignore_eos": request.ignore_eos,
             "skip_special_tokens": request.skip_special_tokens,
             "logit_bias": request.logit_bias,
+            "custom_params": request.custom_params,
         }
 
         # Handle response_format constraints
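The two new completion fields plug into sglang's custom logit processor machinery (note `sglang/srt/sampling/custom_logit_processor.py +77 -2` in the file list). A hedged sketch of what a processor can look like, assuming the documented `CustomLogitProcessor` base class and a server launched with `--enable-custom-logit-processor`; the processor class and parameter names below are made up for illustration:

```python
import torch

from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor


class DisallowTokensProcessor(CustomLogitProcessor):
    """Illustrative processor: mask token ids supplied per request."""

    def __call__(self, logits: torch.Tensor, custom_param_list=None):
        # One params dict per request in the batch.
        for i, params in enumerate(custom_param_list or []):
            for token_id in (params or {}).get("disallowed_ids", []):
                logits[i, token_id] = float("-inf")
        return logits


# The request body would then carry, roughly:
#   "custom_logit_processor": DisallowTokensProcessor().to_str(),
#   "custom_params": {"disallowed_ids": [42]},
```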
--- a/sglang/srt/entrypoints/openai/serving_responses.py
+++ b/sglang/srt/entrypoints/openai/serving_responses.py
@@ -14,6 +14,7 @@ from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional,
 
 import jinja2
 import openai.types.responses as openai_responses_types
+import orjson
 from fastapi import Request
 from fastapi.responses import ORJSONResponse
 from openai.types.responses import (
@@ -778,7 +779,9 @@ class OpenAIServingResponses(OpenAIServingChat):
         # Update the status to "cancelled"
         response.status = "cancelled"
 
-        # Abort the request
+        # The response_id is the same as the rid used when submitting the request
+        self.tokenizer_manager.abort_request(rid=response_id)
+
         if task := self.background_tasks.get(response_id):
             task.cancel()
             try:
@@ -1061,7 +1064,7 @@
         ):
             function_name = previous_item.recipient[len("browser.") :]
             action = None
-            parsed_args = json.loads(previous_item.content[0].text)
+            parsed_args = orjson.loads(previous_item.content[0].text)
             if function_name == "search":
                 action = openai_responses_types.response_function_web_search.ActionSearch(
                     type="search",
--- /dev/null
+++ b/sglang/srt/entrypoints/openai/serving_tokenize.py
@@ -0,0 +1,144 @@
+import logging
+from http import HTTPStatus
+from typing import List, Union
+
+from fastapi import Request
+
+from sglang.srt.entrypoints.openai.protocol import (
+    DetokenizeRequest,
+    DetokenizeResponse,
+    ErrorResponse,
+    TokenizeRequest,
+    TokenizeResponse,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingTokenize(OpenAIServingBase):
+    """Handler for /v1/tokenize requests"""
+
+    def _request_id_prefix(self) -> str:
+        return "tok-"
+
+    def _convert_to_internal_request(
+        self, request: TokenizeRequest, raw_request: Request
+    ) -> tuple[TokenizeRequest, TokenizeRequest]:
+        return request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: TokenizeRequest,
+        request: TokenizeRequest,
+        raw_request: Request,
+    ) -> Union[TokenizeResponse, ErrorResponse]:
+        try:
+            tokenizer = self.tokenizer_manager.tokenizer
+            max_model_len = getattr(tokenizer, "model_max_length", -1)
+
+            if isinstance(request.prompt, str):
+                token_ids = tokenizer.encode(
+                    request.prompt,
+                    add_special_tokens=request.add_special_tokens,
+                )
+                tokens = token_ids
+                count = len(token_ids)
+            elif isinstance(request.prompt, list):
+                token_ids_list = [
+                    tokenizer.encode(
+                        text, add_special_tokens=request.add_special_tokens
+                    )
+                    for text in request.prompt
+                ]
+                tokens = token_ids_list
+                count = [len(ids) for ids in token_ids_list]
+            else:
+                return self.create_error_response(
+                    f"Invalid prompt type: {type(request.prompt)}. Expected str or List[str]."
+                )
+
+            return TokenizeResponse(
+                tokens=tokens, count=count, max_model_len=max_model_len
+            )
+        except Exception as e:
+            logger.error("Error during tokenization", exc_info=True)
+            return self.create_error_response(
+                f"Internal server error during tokenization: {e}",
+                err_type="InternalServerError",
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+            )
+
+
+class OpenAIServingDetokenize(OpenAIServingBase):
+    """Handler for /v1/detokenize requests"""
+
+    def _request_id_prefix(self) -> str:
+        return "detok-"
+
+    def _convert_to_internal_request(
+        self, request: DetokenizeRequest, raw_request: Request
+    ) -> tuple[DetokenizeRequest, DetokenizeRequest]:
+        return request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: DetokenizeRequest,
+        request: DetokenizeRequest,
+        raw_request: Request,
+    ) -> Union[DetokenizeResponse, ErrorResponse]:
+        try:
+            tokenizer = self.tokenizer_manager.tokenizer
+
+            if (
+                isinstance(request.tokens, list)
+                and request.tokens
+                and isinstance(request.tokens[0], int)
+            ):
+                if not all(isinstance(t, int) for t in request.tokens):
+                    return self.create_error_response(
+                        "Invalid input: 'tokens' must be a list of integers."
+                    )
+                tokens_to_decode = [int(t) for t in request.tokens]
+                text = tokenizer.decode(
+                    tokens_to_decode, skip_special_tokens=request.skip_special_tokens
+                )
+                text_out: Union[str, List[str]] = text
+            elif (
+                isinstance(request.tokens, list)
+                and request.tokens
+                and isinstance(request.tokens[0], list)
+            ):
+                texts: List[str] = []
+                for token_list in request.tokens:
+                    if not all(isinstance(t, int) for t in token_list):
+                        return self.create_error_response(
+                            f"Invalid input: Sublist in 'tokens' must contain only integers. Found: {token_list}"
+                        )
+                    decoded_text = tokenizer.decode(
+                        [int(t) for t in token_list],
+                        skip_special_tokens=request.skip_special_tokens,
+                    )
+                    texts.append(decoded_text)
+                text_out = texts
+            elif isinstance(request.tokens, list) and not request.tokens:
+                text_out = ""
+            else:
+                return self.create_error_response(
+                    f"Invalid tokens type: {type(request.tokens)}. Expected List[int] or List[List[int]]."
+                )
+
+            return DetokenizeResponse(text=text_out)
+        except Exception as e:
+            logger.error("Error during detokenization", exc_info=True)
+            if "decode" in str(e).lower():
+                return self.create_error_response(
+                    f"Error decoding tokens: {e}. Input tokens might be invalid for the model.",
+                    err_type="DecodeError",
+                    status_code=HTTPStatus.BAD_REQUEST,
+                )
+            return self.create_error_response(
+                f"Internal server error during detokenization: {e}",
+                err_type="InternalServerError",
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+            )