sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,204 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import time
5
+ import uuid
6
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from fastapi import Request
11
+ from fastapi.responses import ORJSONResponse
12
+
13
+ from sglang.srt.entrypoints.openai.protocol import (
14
+ ClassifyRequest,
15
+ ClassifyResponse,
16
+ ErrorResponse,
17
+ )
18
+ from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
19
+ from sglang.srt.managers.io_struct import EmbeddingReqInput
20
+
21
+ if TYPE_CHECKING:
22
+ from sglang.srt.managers.template_manager import TemplateManager
23
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class OpenAIServingClassify(OpenAIServingBase):
    """Handler for v1/classify requests.

    Inputs are submitted through the embedding request path
    (``EmbeddingReqInput``). The vector returned under the ``"embedding"``
    key of each result is treated as per-class scores: it is softmaxed into
    probabilities and its argmax is mapped to a label via ``id2label``.
    """

    def __init__(
        self,
        tokenizer_manager: TokenizerManager,
        template_manager: TemplateManager,
    ):
        """Set up the handler.

        Raises:
            ValueError: If no ``id2label`` mapping can be derived from the
                model config.
        """
        super().__init__(tokenizer_manager)
        self.template_manager = template_manager
        self.id2label = self._get_id2label_mapping()
        # Prefer the served model name; fall back to the model path.
        self.model_name = (
            self.tokenizer_manager.served_model_name
            or self.tokenizer_manager.server_args.model_path
        )
        if not self.id2label:
            raise ValueError("id2label mapping is missing")

    def _request_id_prefix(self) -> str:
        """Return the prefix used for generated classify response IDs."""
        return "classify-"

    def _convert_to_internal_request(
        self,
        request: ClassifyRequest,
        raw_request: Request = None,
    ) -> tuple[EmbeddingReqInput, ClassifyRequest]:
        """Convert a classify request to the internal embedding request format.

        A string, or a non-empty list whose first element is a string, is
        passed as ``text``; anything else (token-ID lists, empty lists,
        unexpected types) is passed as ``input_ids``.

        Returns:
            The adapted ``EmbeddingReqInput`` together with the original
            request.
        """
        prompt = request.input

        is_text = isinstance(prompt, str) or (
            isinstance(prompt, list)
            and len(prompt) > 0
            and isinstance(prompt[0], str)
        )
        prompt_kwargs = {"text": prompt} if is_text else {"input_ids": prompt}

        adapted_request = EmbeddingReqInput(
            **prompt_kwargs,
            rid=request.rid,
            priority=request.priority,
        )

        return adapted_request, request

    def _validate_request(self, request: ClassifyRequest) -> Optional[str]:
        """Validate that the input is not empty or whitespace only.

        Returns:
            An error message describing the first problem found, or ``None``
            when the input is acceptable. Lists whose first element is
            neither ``str`` nor ``int`` are not inspected further.
        """
        value = request.input
        if not value:
            return "Input cannot be empty"

        # Single string
        if isinstance(value, str):
            if not value.strip():
                return "Input cannot be empty or whitespace only"
            return None

        # List inputs: the first element decides which element type is expected.
        # The list is known to be non-empty here (empty lists are falsy above).
        if isinstance(value, list):
            first_item = value[0]

            if isinstance(first_item, str):
                for i, item in enumerate(value):
                    if not isinstance(item, str):
                        return "All items in input list must be strings"
                    if not item.strip():
                        return f"Input at index {i} cannot be empty or whitespace only"
            elif isinstance(first_item, int):
                for i, item in enumerate(value):
                    if not isinstance(item, int):
                        return "All items in input list must be integers"
                    if item < 0:
                        return f"Token ID at index {i} must be non-negative"
        return None

    def _get_id2label_mapping(self) -> Optional[Dict[int, str]]:
        """Get the id2label mapping from the model config.

        Uses ``hf_config.id2label`` when it is present and non-empty;
        otherwise builds ``{0: "LABEL_0", 1: "LABEL_1", ...}`` from
        ``hf_config.num_labels``. Returns ``None`` when neither is available
        or the config cannot be read.
        """
        try:
            hf_config = self.tokenizer_manager.model_config.hf_config

            # getattr (rather than direct access) so that a config without an
            # ``id2label`` attribute still reaches the ``num_labels`` fallback
            # instead of jumping straight to the except handler.
            id2label = getattr(hf_config, "id2label", None)
            if id2label:
                return id2label

            num_labels = getattr(hf_config, "num_labels", None)
            if num_labels:
                # Default mapping: {0: "LABEL_0", 1: "LABEL_1", ...}
                return {i: f"LABEL_{i}" for i in range(num_labels)}

        except Exception as e:
            # Best effort: the caller treats a missing mapping as fatal.
            logger.warning(f"Failed to get id2label mapping: {e}")

        return None

    async def _handle_non_streaming_request(
        self,
        adapted_request: EmbeddingReqInput,
        request: ClassifyRequest,
        raw_request: Request,
    ) -> Union[ClassifyResponse, ErrorResponse, ORJSONResponse]:
        """Handle non-streaming classification request.

        Awaits the first item yielded by the tokenizer manager and converts
        it into a ``ClassifyResponse``. A ``ValueError`` raised while
        generating is turned into an error response.
        """
        try:
            ret = await self.tokenizer_manager.generate_request(
                adapted_request, raw_request
            ).__anext__()
        except ValueError as e:
            return self.create_error_response(str(e))

        # Normalize a single result to a one-element batch.
        if not isinstance(ret, list):
            ret = [ret]

        return self._build_classify_response(ret)

    def _lookup_label(self, class_id: int) -> str:
        """Return the label for ``class_id``.

        Accepts mappings keyed by ``int`` or by the decimal string of the
        class id. Raises ``KeyError`` when neither key is present.
        """
        if class_id in self.id2label:
            return self.id2label[class_id]
        return self.id2label[str(class_id)]

    def _build_classify_response(self, ret: List[Dict[str, Any]]) -> ClassifyResponse:
        """Build a ``ClassifyResponse`` from raw per-input results.

        Each item is expected to carry an ``"embedding"`` list and a
        ``"meta_info"`` dict. Items with an empty or missing embedding, or
        whose embedding cannot be processed, are reported with label
        ``"Default"`` and ``probs == [1.0]``.
        """
        request_id = f"{self._request_id_prefix()}{uuid.uuid4().hex}"
        created_time = int(time.time())
        classify_objects = []
        prompt_tokens = 0

        for i, item in enumerate(ret):
            embedding = item.get("embedding", [])
            meta_info = item.get("meta_info", {})

            prompt_tokens += meta_info.get("prompt_tokens", 0)

            # Fallback values, overwritten only if processing fully succeeds.
            probs = [1.0]
            label = "Default"

            if embedding:
                try:
                    embedding_tensor = torch.tensor(embedding, dtype=torch.float32)
                    item_probs = F.softmax(embedding_tensor, dim=0).tolist()
                    predicted_class = torch.argmax(embedding_tensor).item()
                    item_label = self._lookup_label(predicted_class)
                except Exception as e:
                    logger.error(f"Error processing embedding for item {i}: {e}")
                else:
                    probs = item_probs
                    label = item_label

            classify_objects.append(
                {
                    "index": i,
                    "label": label,
                    "probs": probs,
                    "num_classes": len(probs),
                }
            )

        response = {
            "id": request_id,
            "object": "list",
            "created": created_time,
            "model": self.model_name,
            "data": classify_objects,
            "usage": {
                "prompt_tokens": prompt_tokens,
                "total_tokens": prompt_tokens,
                "completion_tokens": 0,
                "prompt_tokens_details": None,
            },
        }

        return ClassifyResponse(**response)
@@ -90,8 +90,19 @@ class OpenAIServingCompletion(OpenAIServingBase):
90
90
  else:
91
91
  prompt_kwargs = {"input_ids": prompt}
92
92
 
93
- # Extract customer labels from raw request headers
94
- customer_labels = self.extract_customer_labels(raw_request)
93
+ # Extract custom labels from raw request headers
94
+ custom_labels = self.extract_custom_labels(raw_request)
95
+
96
+ # Resolve LoRA adapter from model parameter or explicit lora_path
97
+ lora_path = self._resolve_lora_path(request.model, request.lora_path)
98
+ if lora_path:
99
+ first_adapter = (
100
+ lora_path
101
+ if isinstance(lora_path, str)
102
+ else next((a for a in lora_path if a), None)
103
+ )
104
+ if first_adapter:
105
+ self._validate_lora_enabled(first_adapter)
95
106
 
96
107
  adapted_request = GenerateReqInput(
97
108
  **prompt_kwargs,
@@ -101,13 +112,16 @@ class OpenAIServingCompletion(OpenAIServingBase):
101
112
  logprob_start_len=logprob_start_len,
102
113
  return_text_in_logprobs=True,
103
114
  stream=request.stream,
104
- lora_path=request.lora_path,
115
+ lora_path=lora_path,
105
116
  bootstrap_host=request.bootstrap_host,
106
117
  bootstrap_port=request.bootstrap_port,
107
118
  bootstrap_room=request.bootstrap_room,
108
119
  return_hidden_states=request.return_hidden_states,
109
120
  rid=request.rid,
110
- customer_labels=customer_labels,
121
+ extra_key=self._compute_extra_key(request),
122
+ priority=request.priority,
123
+ custom_labels=custom_labels,
124
+ custom_logit_processor=request.custom_logit_processor,
111
125
  )
112
126
 
113
127
  return adapted_request, request
@@ -121,6 +135,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
121
135
  "min_new_tokens": request.min_tokens,
122
136
  "stop": request.stop,
123
137
  "stop_token_ids": request.stop_token_ids,
138
+ "stop_regex": request.stop_regex,
124
139
  "top_p": request.top_p,
125
140
  "top_k": request.top_k,
126
141
  "min_p": request.min_p,
@@ -135,6 +150,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
135
150
  "ignore_eos": request.ignore_eos,
136
151
  "skip_special_tokens": request.skip_special_tokens,
137
152
  "logit_bias": request.logit_bias,
153
+ "custom_params": request.custom_params,
138
154
  }
139
155
 
140
156
  # Handle response_format constraints
@@ -125,6 +125,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
125
125
  adapted_request = EmbeddingReqInput(
126
126
  **prompt_kwargs,
127
127
  rid=request.rid,
128
+ priority=request.priority,
128
129
  )
129
130
 
130
131
  return adapted_request, request
@@ -14,6 +14,7 @@ from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional,
14
14
 
15
15
  import jinja2
16
16
  import openai.types.responses as openai_responses_types
17
+ import orjson
17
18
  from fastapi import Request
18
19
  from fastapi.responses import ORJSONResponse
19
20
  from openai.types.responses import (
@@ -123,6 +124,39 @@ class OpenAIServingResponses(OpenAIServingChat):
123
124
 
124
125
  self.background_tasks: dict[str, asyncio.Task] = {}
125
126
 
127
+ # error helpers dedicated for v1/responses
128
def create_error_response(
    self,
    message: str,
    err_type: str = "invalid_request_error",
    status_code: int = 400,
    param: Optional[str] = None,
) -> ORJSONResponse:
    """Return an HTTP error response with the details nested under ``"error"``.

    The JSON body has the shape
    ``{"error": {"message", "type", "param", "code"}}``; ``status_code`` is
    used both as the ``code`` field and as the HTTP status of the response.
    """
    payload = {
        "error": {
            "message": message,
            "type": err_type,
            "param": param,
            "code": status_code,
        }
    }
    return ORJSONResponse(content=payload, status_code=status_code)
142
+
143
def create_streaming_error_response(
    self,
    message: str,
    err_type: str = "BadRequestError",
    status_code: int = 400,
) -> str:
    """Serialize an error as a JSON string for use inside a stream.

    The result has the shape
    ``{"error": {"message", "type", "param", "code"}}`` with ``param``
    always ``null``.
    """
    error_body = {
        "message": message,
        "type": err_type,
        "param": None,
        "code": status_code,
    }
    payload = {"error": error_body}
    return json.dumps(payload)
159
+
126
160
  def _request_id_prefix(self) -> str:
127
161
  return "resp_"
128
162
 
@@ -245,6 +279,7 @@ class OpenAIServingResponses(OpenAIServingChat):
245
279
  sampling_params=sampling_params,
246
280
  stream=request.stream,
247
281
  rid=request.request_id,
282
+ extra_key=self._compute_extra_key(request),
248
283
  background=request.background,
249
284
  )
250
285
 
@@ -744,7 +779,9 @@ class OpenAIServingResponses(OpenAIServingChat):
744
779
  # Update the status to "cancelled"
745
780
  response.status = "cancelled"
746
781
 
747
- # Abort the request
782
+ # The response_id is the same as the rid used when submitting the request
783
+ self.tokenizer_manager.abort_request(rid=response_id)
784
+
748
785
  if task := self.background_tasks.get(response_id):
749
786
  task.cancel()
750
787
  try:
@@ -833,6 +870,13 @@ class OpenAIServingResponses(OpenAIServingChat):
833
870
 
834
871
  async for ctx in result_generator:
835
872
 
873
+ # Only process context objects that implement the `is_expecting_start()` method,
874
+ # which indicates they support per-turn streaming (e.g., StreamingHarmonyContext).
875
+ # Contexts without this method are skipped, as they do not represent a new turn
876
+ # or are not compatible with per-turn handling in the /v1/responses endpoint.
877
+ if not hasattr(ctx, "is_expecting_start"):
878
+ continue
879
+
836
880
  if ctx.is_expecting_start():
837
881
  current_output_index += 1
838
882
  sent_output_item_added = False
@@ -1020,7 +1064,7 @@ class OpenAIServingResponses(OpenAIServingChat):
1020
1064
  ):
1021
1065
  function_name = previous_item.recipient[len("browser.") :]
1022
1066
  action = None
1023
- parsed_args = json.loads(previous_item.content[0].text)
1067
+ parsed_args = orjson.loads(previous_item.content[0].text)
1024
1068
  if function_name == "search":
1025
1069
  action = openai_responses_types.response_function_web_search.ActionSearch(
1026
1070
  type="search",
@@ -1250,6 +1294,7 @@ class OpenAIServingResponses(OpenAIServingChat):
1250
1294
  sampling_params=sampling_params,
1251
1295
  stream=adapted_request.stream,
1252
1296
  rid=request_id,
1297
+ extra_key=adapted_request.extra_key,
1253
1298
  return_logprob=adapted_request.return_logprob,
1254
1299
  logprob_start_len=adapted_request.logprob_start_len,
1255
1300
  top_logprobs_num=adapted_request.top_logprobs_num,
@@ -0,0 +1,144 @@
1
+ import logging
2
+ from http import HTTPStatus
3
+ from typing import List, Union
4
+
5
+ from fastapi import Request
6
+
7
+ from sglang.srt.entrypoints.openai.protocol import (
8
+ DetokenizeRequest,
9
+ DetokenizeResponse,
10
+ ErrorResponse,
11
+ TokenizeRequest,
12
+ TokenizeResponse,
13
+ )
14
+ from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class OpenAIServingTokenize(OpenAIServingBase):
    """Handler for /v1/tokenize requests"""

    def _request_id_prefix(self) -> str:
        """Return the request-ID prefix for tokenize requests."""
        return "tok-"

    def _convert_to_internal_request(
        self, request: TokenizeRequest, raw_request: Request
    ) -> tuple[TokenizeRequest, TokenizeRequest]:
        """Return the request unchanged as both adapted and original request."""
        return request, request

    async def _handle_non_streaming_request(
        self,
        adapted_request: TokenizeRequest,
        request: TokenizeRequest,
        raw_request: Request,
    ) -> Union[TokenizeResponse, ErrorResponse]:
        """Tokenize ``request.prompt`` with the tokenizer manager's tokenizer.

        A ``str`` prompt yields a flat token list and an integer count; a
        ``list`` prompt yields one token list and one count per element. Any
        other prompt type produces an error response, and any exception
        raised while tokenizing is logged and reported as an internal
        server error.
        """
        try:
            tokenizer = self.tokenizer_manager.tokenizer
            max_model_len = getattr(tokenizer, "model_max_length", -1)
            prompt = request.prompt

            # Guard clause: only str and list prompts are supported.
            if not isinstance(prompt, (str, list)):
                return self.create_error_response(
                    f"Invalid prompt type: {type(request.prompt)}. Expected str or List[str]."
                )

            def encode(text):
                return tokenizer.encode(
                    text, add_special_tokens=request.add_special_tokens
                )

            if isinstance(prompt, str):
                tokens = encode(prompt)
                count = len(tokens)
            else:
                tokens = [encode(text) for text in prompt]
                count = [len(ids) for ids in tokens]

            return TokenizeResponse(
                tokens=tokens, count=count, max_model_len=max_model_len
            )
        except Exception as e:
            logger.error("Error during tokenization", exc_info=True)
            return self.create_error_response(
                f"Internal server error during tokenization: {e}",
                err_type="InternalServerError",
                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            )
71
+
72
+
73
class OpenAIServingDetokenize(OpenAIServingBase):
    """Handler for /v1/detokenize requests"""

    def _request_id_prefix(self) -> str:
        """Return the request-ID prefix for detokenize requests."""
        return "detok-"

    def _convert_to_internal_request(
        self, request: DetokenizeRequest, raw_request: Request
    ) -> tuple[DetokenizeRequest, DetokenizeRequest]:
        """Return the request unchanged as both adapted and original request."""
        return request, request

    async def _handle_non_streaming_request(
        self,
        adapted_request: DetokenizeRequest,
        request: DetokenizeRequest,
        raw_request: Request,
    ) -> Union[DetokenizeResponse, ErrorResponse]:
        """Decode ``request.tokens`` back into text.

        Accepted shapes, selected by the type of the first element:

        * a flat list of ints -> a single decoded string;
        * a list of lists of ints -> one decoded string per sublist;
        * an empty list -> the empty string.

        Malformed input yields an invalid-input error response. Exceptions
        raised while decoding are logged; those whose message mentions
        "decode" are reported as a 400 ``DecodeError``, everything else as a
        500 ``InternalServerError``.
        """
        try:
            tokenizer = self.tokenizer_manager.tokenizer

            if (
                isinstance(request.tokens, list)
                and request.tokens
                and isinstance(request.tokens[0], int)
            ):
                if not all(isinstance(t, int) for t in request.tokens):
                    return self.create_error_response(
                        "Invalid input: 'tokens' must be a list of integers."
                    )
                # bool passes the isinstance check above; int() normalises it.
                tokens_to_decode = [int(t) for t in request.tokens]
                text = tokenizer.decode(
                    tokens_to_decode, skip_special_tokens=request.skip_special_tokens
                )
                text_out: Union[str, List[str]] = text
            elif (
                isinstance(request.tokens, list)
                and request.tokens
                and isinstance(request.tokens[0], list)
            ):
                texts: List[str] = []
                for token_list in request.tokens:
                    # Check the element is a list before iterating it: a
                    # scalar here (e.g. [[1], 2]) would otherwise raise
                    # TypeError and be misreported as an internal error.
                    if not isinstance(token_list, list) or not all(
                        isinstance(t, int) for t in token_list
                    ):
                        return self.create_error_response(
                            f"Invalid input: Sublist in 'tokens' must contain only integers. Found: {token_list}"
                        )
                    decoded_text = tokenizer.decode(
                        [int(t) for t in token_list],
                        skip_special_tokens=request.skip_special_tokens,
                    )
                    texts.append(decoded_text)
                text_out = texts
            elif isinstance(request.tokens, list) and not request.tokens:
                text_out = ""
            else:
                return self.create_error_response(
                    f"Invalid tokens type: {type(request.tokens)}. Expected List[int] or List[List[int]]."
                )

            return DetokenizeResponse(text=text_out)
        except Exception as e:
            logger.error("Error during detokenization", exc_info=True)
            # Heuristic: exceptions mentioning "decode" are treated as bad
            # client input rather than a server fault.
            if "decode" in str(e).lower():
                return self.create_error_response(
                    f"Error decoding tokens: {e}. Input tokens might be invalid for the model.",
                    err_type="DecodeError",
                    status_code=HTTPStatus.BAD_REQUEST,
                )
            return self.create_error_response(
                f"Internal server error during detokenization: {e}",
                err_type="InternalServerError",
                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            )