sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396)
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
@@ -13,15 +13,18 @@
13
13
  # ==============================================================================
14
14
  """Pydantic models for OpenAI API protocol"""
15
15
 
16
+ import logging
16
17
  import time
17
18
  import uuid
18
19
  from dataclasses import dataclass
19
- from typing import Any, Dict, List, Optional, TypeAlias, Union
20
+ from typing import Any, Dict, List, NamedTuple, Optional, TypeAlias, Union
20
21
 
21
22
  from openai.types.responses import (
22
23
  ResponseFunctionToolCall,
23
24
  ResponseInputItemParam,
24
25
  ResponseOutputItem,
26
+ ResponseOutputMessage,
27
+ ResponseOutputText,
25
28
  ResponseReasoningItem,
26
29
  )
27
30
  from openai.types.responses.response import ToolChoice
@@ -35,6 +38,10 @@ from pydantic import (
35
38
  )
36
39
  from typing_extensions import Literal
37
40
 
41
+ from sglang.utils import convert_json_schema_to_str
42
+
43
+ logger = logging.getLogger(__name__)
44
+
38
45
  DEFAULT_MODEL_NAME = "default"
39
46
 
40
47
 
@@ -228,6 +235,15 @@ class CompletionRequest(BaseModel):
228
235
 
229
236
  # For request id
230
237
  rid: Optional[Union[List[str], str]] = None
238
+ # Extra key for classifying the request (e.g. cache_salt)
239
+ extra_key: Optional[Union[List[str], str]] = None
240
+ # Cache salt for request caching
241
+ cache_salt: Optional[Union[List[str], str]] = None
242
+ # Priority for the request
243
+ priority: Optional[int] = None
244
+
245
+ # For custom metric labels
246
+ custom_labels: Optional[Dict[str, str]] = None
231
247
 
232
248
  @field_validator("max_tokens")
233
249
  @classmethod
@@ -334,7 +350,7 @@ class FunctionResponse(BaseModel):
334
350
  """Function response."""
335
351
 
336
352
  name: Optional[str] = None
337
- arguments: Optional[str] = None
353
+ arguments: Optional[str | Dict[str, Any]] = None
338
354
 
339
355
 
340
356
  class ToolCall(BaseModel):
@@ -383,7 +399,7 @@ class Function(BaseModel):
383
399
  """Function descriptions."""
384
400
 
385
401
  description: Optional[str] = Field(default=None, examples=[None])
386
- name: Optional[str] = None
402
+ name: str
387
403
  parameters: Optional[object] = None
388
404
  strict: bool = False
389
405
 
@@ -434,8 +450,8 @@ class ChatCompletionRequest(BaseModel):
434
450
  stop: Optional[Union[str, List[str]]] = None
435
451
  stream: bool = False
436
452
  stream_options: Optional[StreamOptions] = None
437
- temperature: float = 0.7
438
- top_p: float = 1.0
453
+ temperature: Optional[float] = None
454
+ top_p: Optional[float] = None
439
455
  user: Optional[str] = None
440
456
  tools: Optional[List[Tool]] = Field(default=None, examples=[None])
441
457
  tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
@@ -447,9 +463,50 @@ class ChatCompletionRequest(BaseModel):
447
463
  description="Constrains effort on reasoning for reasoning models. "
448
464
  "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
449
465
  "result in faster responses and fewer tokens used on reasoning in a response. "
450
- "Currently only supported for OpenAI models.",
466
+ "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
451
467
  )
452
468
 
469
+ # Extra parameters for SRT backend only and will be ignored by OpenAI models.
470
+ top_k: Optional[int] = None
471
+ min_p: Optional[float] = None
472
+ min_tokens: int = 0
473
+ regex: Optional[str] = None
474
+ ebnf: Optional[str] = None
475
+ repetition_penalty: Optional[float] = None
476
+ stop_token_ids: Optional[List[int]] = None
477
+ no_stop_trim: bool = False
478
+ ignore_eos: bool = False
479
+ continue_final_message: bool = False
480
+ skip_special_tokens: bool = True
481
+ lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
482
+ session_params: Optional[Dict] = None
483
+ separate_reasoning: bool = True
484
+ stream_reasoning: bool = True
485
+ chat_template_kwargs: Optional[Dict] = None
486
+
487
+ # For request id
488
+ rid: Optional[Union[List[str], str]] = None
489
+ # Extra key for classifying the request (e.g. cache_salt)
490
+ extra_key: Optional[Union[List[str], str]] = None
491
+ # Cache salt for request caching
492
+ cache_salt: Optional[Union[List[str], str]] = None
493
+ # Priority for the request
494
+ priority: Optional[int] = None
495
+
496
+ # For PD disaggregation
497
+ bootstrap_host: Optional[Union[List[str], str]] = None
498
+ bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
499
+ bootstrap_room: Optional[Union[List[int], int]] = None
500
+
501
+ # OpenAI/SGLang default sampling parameters
502
+ _DEFAULT_SAMPLING_PARAMS = {
503
+ "temperature": 1.0,
504
+ "top_p": 1.0,
505
+ "top_k": -1,
506
+ "min_p": 0.0,
507
+ "repetition_penalty": 1.0,
508
+ }
509
+
453
510
  @model_validator(mode="before")
454
511
  @classmethod
455
512
  def set_tool_choice_default(cls, values):
@@ -520,31 +577,81 @@ class ChatCompletionRequest(BaseModel):
520
577
 
521
578
  return values
522
579
 
523
- # Extra parameters for SRT backend only and will be ignored by OpenAI models.
524
- top_k: int = -1
525
- min_p: float = 0.0
526
- min_tokens: int = 0
527
- regex: Optional[str] = None
528
- ebnf: Optional[str] = None
529
- repetition_penalty: float = 1.0
530
- stop_token_ids: Optional[List[int]] = None
531
- no_stop_trim: bool = False
532
- ignore_eos: bool = False
533
- continue_final_message: bool = False
534
- skip_special_tokens: bool = True
535
- lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
536
- session_params: Optional[Dict] = None
537
- separate_reasoning: bool = True
538
- stream_reasoning: bool = True
539
- chat_template_kwargs: Optional[Dict] = None
580
+ def to_sampling_params(
581
+ self,
582
+ stop: List[str],
583
+ model_generation_config: Dict[str, Any],
584
+ tool_call_constraint: Optional[Any] = None,
585
+ ) -> Dict[str, Any]:
586
+ """
587
+ Convert request to sampling parameters.
588
+ Priority: user value > model generation_config > OpenAI defaults
589
+ """
590
+
591
+ def get_param(param_name: str):
592
+ value = getattr(self, param_name)
593
+ if value is None:
594
+ return model_generation_config.get(
595
+ param_name, self._DEFAULT_SAMPLING_PARAMS[param_name]
596
+ )
597
+ return value
598
+
599
+ sampling_params = {
600
+ "temperature": get_param("temperature"),
601
+ "max_new_tokens": self.max_tokens or self.max_completion_tokens,
602
+ "min_new_tokens": self.min_tokens,
603
+ "stop": stop,
604
+ "stop_token_ids": self.stop_token_ids,
605
+ "top_p": get_param("top_p"),
606
+ "top_k": get_param("top_k"),
607
+ "min_p": get_param("min_p"),
608
+ "presence_penalty": self.presence_penalty,
609
+ "frequency_penalty": self.frequency_penalty,
610
+ "repetition_penalty": get_param("repetition_penalty"),
611
+ "regex": self.regex,
612
+ "ebnf": self.ebnf,
613
+ "n": self.n,
614
+ "no_stop_trim": self.no_stop_trim,
615
+ "ignore_eos": self.ignore_eos,
616
+ "skip_special_tokens": self.skip_special_tokens,
617
+ "logit_bias": self.logit_bias,
618
+ }
540
619
 
541
- # For request id
542
- rid: Optional[Union[List[str], str]] = None
620
+ if self.response_format and self.response_format.type == "json_schema":
621
+ sampling_params["json_schema"] = convert_json_schema_to_str(
622
+ self.response_format.json_schema.schema_
623
+ )
624
+ elif self.response_format and self.response_format.type == "json_object":
625
+ sampling_params["json_schema"] = '{"type": "object"}'
626
+ elif self.response_format and self.response_format.type == "structural_tag":
627
+ sampling_params["structural_tag"] = convert_json_schema_to_str(
628
+ self.response_format.model_dump(by_alias=True)
629
+ )
543
630
 
544
- # For PD disaggregation
545
- bootstrap_host: Optional[Union[List[str], str]] = None
546
- bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
547
- bootstrap_room: Optional[Union[List[int], int]] = None
631
+ # Check if there are already existing output constraints
632
+ has_existing_constraints = (
633
+ sampling_params.get("regex")
634
+ or sampling_params.get("ebnf")
635
+ or sampling_params.get("structural_tag")
636
+ or sampling_params.get("json_schema")
637
+ )
638
+
639
+ if tool_call_constraint and has_existing_constraints:
640
+ logger.warning("Constrained decoding is not compatible with tool calls.")
641
+ elif tool_call_constraint:
642
+ constraint_type, constraint_value = tool_call_constraint
643
+ if constraint_type == "structural_tag":
644
+ sampling_params[constraint_type] = convert_json_schema_to_str(
645
+ constraint_value.model_dump(by_alias=True)
646
+ )
647
+ elif constraint_type == "json_schema":
648
+ sampling_params[constraint_type] = convert_json_schema_to_str(
649
+ constraint_value
650
+ )
651
+ else:
652
+ sampling_params[constraint_type] = constraint_value
653
+
654
+ return sampling_params
548
655
 
549
656
 
550
657
  class ChatMessage(BaseModel):
@@ -641,6 +748,8 @@ class EmbeddingRequest(BaseModel):
641
748
 
642
749
  # The request id.
643
750
  rid: Optional[Union[List[str], str]] = None
751
+ # Priority for the request
752
+ priority: Optional[int] = None
644
753
 
645
754
 
646
755
  class EmbeddingObject(BaseModel):
@@ -692,12 +801,50 @@ class RerankResponse(BaseModel):
692
801
  meta_info: Optional[dict] = None
693
802
 
694
803
 
804
class TokenizeRequest(BaseModel):
    """Request schema for the /tokenize endpoint."""

    # Model name; DEFAULT_MODEL_NAME is used when the client omits it.
    model: str = DEFAULT_MODEL_NAME
    # Either one prompt string or a list of prompt strings.
    prompt: Union[str, List[str]]
    # Special-token toggle for encoding; enabled unless the client overrides it.
    add_special_tokens: bool = Field(
        True,
        description="whether to add model-specific special tokens (e.g. BOS/EOS) during encoding.",
    )
813
+
814
+
815
class TokenizeResponse(BaseModel):
    """Response schema for the /tokenize endpoint."""

    # Token ids: a flat list, or a list of lists.
    tokens: Union[List[int], List[List[int]]]
    # A single count, or a list of counts.
    count: Union[int, List[int]]
    # Required integer field; its value is supplied by whoever builds the response.
    max_model_len: int
821
+
822
+
823
class DetokenizeRequest(BaseModel):
    """Request schema for the /detokenize endpoint."""

    # Model name; DEFAULT_MODEL_NAME is used when the client omits it.
    model: str = DEFAULT_MODEL_NAME
    # Token ids to decode: a flat list, or a list of lists.
    tokens: Union[List[int], List[List[int]]]
    # Special-token toggle for decoding; enabled unless the client overrides it.
    skip_special_tokens: bool = Field(
        True,
        description="whether to exclude special tokens (e.g. padding or EOS) during decoding.",
    )
832
+
833
+
834
class DetokenizeResponse(BaseModel):
    """Response schema for the /detokenize endpoint."""

    # Decoded text: one string, or a list of strings.
    text: Union[str, List[str]]
838
+
839
+
695
840
  OpenAIServingRequest = Union[
696
841
  ChatCompletionRequest,
697
842
  CompletionRequest,
698
843
  EmbeddingRequest,
699
844
  ScoringRequest,
700
845
  V1RerankReqInput,
846
+ TokenizeRequest,
847
+ DetokenizeRequest,
701
848
  ]
702
849
 
703
850
 
@@ -769,6 +916,13 @@ class ResponsesRequest(BaseModel):
769
916
  description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
770
917
  )
771
918
  priority: int = Field(default=0, description="Request priority")
919
+ extra_key: Optional[str] = Field(
920
+ default=None,
921
+ description="Extra key for classifying the request (e.g. cache_salt)",
922
+ )
923
+ cache_salt: Optional[str] = Field(
924
+ default=None, description="Cache salt for request caching"
925
+ )
772
926
 
773
927
  # SGLang-specific sampling parameters
774
928
  frequency_penalty: float = 0.0
@@ -857,6 +1011,26 @@ class ResponsesResponse(BaseModel):
857
1011
  tool_choice: str = "auto"
858
1012
  tools: List[ResponseTool] = Field(default_factory=list)
859
1013
 
1014
+ # OpenAI compatibility fields. not all are used at the moment.
1015
+ # Recommend checking https://platform.openai.com/docs/api-reference/responses
1016
+ error: Optional[dict] = None
1017
+ incomplete_details: Optional[dict] = None # TODO(v) support this input
1018
+ instructions: Optional[str] = None
1019
+ max_output_tokens: Optional[int] = None
1020
+ previous_response_id: Optional[str] = None
1021
+ reasoning: Optional[dict] = (
1022
+ # Unused. No model supports this. For GPT-oss, system prompt sets
1023
+ # the field, not server args.
1024
+ None # {"effort": Optional[str], "summary": Optional[str]}
1025
+ )
1026
+ store: Optional[bool] = None
1027
+ temperature: Optional[float] = None
1028
+ text: Optional[dict] = None # e.g. {"format": {"type": "text"}}
1029
+ top_p: Optional[float] = None
1030
+ truncation: Optional[str] = None
1031
+ user: Optional[str] = None
1032
+ metadata: Optional[Dict[str, Any]] = None
1033
+
860
1034
  @classmethod
861
1035
  def from_request(
862
1036
  cls,
@@ -871,6 +1045,41 @@ class ResponsesResponse(BaseModel):
871
1045
  usage: Optional[UsageInfo],
872
1046
  ) -> "ResponsesResponse":
873
1047
  """Create a response from a request."""
1048
+
1049
+ # Determine if the output is plain text only to set text.format
1050
def _is_text_only(
    items: List[
        Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
    ]
) -> bool:
    """Tell whether ``items`` is non-empty and consists solely of text output.

    Reasoning items, function tool calls, unknown item types, and messages
    holding any non-text content part all make the result ``False``.
    """
    if not items:
        return False

    for item in items:
        # Reasoning items and tool calls are never plain text.
        if isinstance(item, (ResponseReasoningItem, ResponseFunctionToolCall)):
            return False
        try:
            if isinstance(item, ResponseOutputText):
                continue
            if not isinstance(item, ResponseOutputMessage):
                # Unknown type, not considered text-only
                return False
            parts = item.content
            # A message with no content is skipped; otherwise every part must be text.
            if parts and not all(
                isinstance(part, ResponseOutputText) for part in parts
            ):
                return False
        except AttributeError:
            return False
    return True
1080
+
1081
+ text_format = {"format": {"type": "text"}} if _is_text_only(output) else None
1082
+
874
1083
  return cls(
875
1084
  id=request.request_id,
876
1085
  created_at=created_time,
@@ -881,6 +1090,23 @@ class ResponsesResponse(BaseModel):
881
1090
  parallel_tool_calls=request.parallel_tool_calls or True,
882
1091
  tool_choice=request.tool_choice,
883
1092
  tools=request.tools,
1093
+ # fields for parity with v1/responses
1094
+ error=None,
1095
+ incomplete_details=None,
1096
+ instructions=request.instructions,
1097
+ max_output_tokens=request.max_output_tokens,
1098
+ previous_response_id=request.previous_response_id, # TODO(v): ensure this is propagated if retrieved from store
1099
+ reasoning={
1100
+ "effort": request.reasoning.effort if request.reasoning else None,
1101
+ "summary": None, # unused
1102
+ },
1103
+ store=request.store,
1104
+ temperature=request.temperature,
1105
+ text=text_format, # TODO(v): Expand coverage per https://platform.openai.com/docs/api-reference/responses/list
1106
+ top_p=request.top_p,
1107
+ truncation=request.truncation,
1108
+ user=request.user,
1109
+ metadata=request.metadata or {},
884
1110
  )
885
1111
 
886
1112
 
@@ -919,6 +1145,16 @@ class MessageProcessingResult:
919
1145
  tool_call_constraint: Optional[Any] = None
920
1146
 
921
1147
 
1148
class ToolCallProcessingResult(NamedTuple):
    """Result of processing tool calls in a response."""

    # List of ToolCall objects or None if parsing failed
    tool_calls: Optional[List[Any]]
    # Text remaining after parsing tool calls
    remaining_text: str
    # Updated finish reason dictionary
    finish_reason: Dict[str, Any]
1156
+
1157
+
922
1158
  class ResponseReasoningTextContent(BaseModel):
923
1159
  text: str
924
1160
  type: Literal["reasoning_text"] = "reasoning_text"
@@ -1,15 +1,20 @@
1
+ from __future__ import annotations
2
+
1
3
  import json
2
4
  import logging
3
5
  import uuid
4
6
  from abc import ABC, abstractmethod
5
- from typing import Any, Optional, Union
7
+ from typing import TYPE_CHECKING, Any, Optional, Union
6
8
 
7
9
  from fastapi import HTTPException, Request
8
10
  from fastapi.responses import ORJSONResponse, StreamingResponse
9
11
 
10
12
  from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
11
13
  from sglang.srt.managers.io_struct import GenerateReqInput
12
- from sglang.srt.managers.tokenizer_manager import TokenizerManager
14
+ from sglang.srt.server_args import ServerArgs
15
+
16
+ if TYPE_CHECKING:
17
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
13
18
 
14
19
  logger = logging.getLogger(__name__)
15
20
 
@@ -20,6 +25,14 @@ class OpenAIServingBase(ABC):
20
25
 
21
26
  def __init__(self, tokenizer_manager: TokenizerManager):
22
27
  self.tokenizer_manager = tokenizer_manager
28
+ self.allowed_custom_labels = (
29
+ set(
30
+ self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
31
+ )
32
+ if isinstance(self.tokenizer_manager.server_args, ServerArgs)
33
+ and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
34
+ else None
35
+ )
23
36
 
24
37
  async def handle_request(
25
38
  self, request: OpenAIServingRequest, raw_request: Request
@@ -33,7 +46,7 @@ class OpenAIServingBase(ABC):
33
46
 
34
47
  # Convert to internal format
35
48
  adapted_request, processed_request = self._convert_to_internal_request(
36
- request
49
+ request, raw_request
37
50
  )
38
51
 
39
52
  # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
@@ -49,6 +62,12 @@ class OpenAIServingBase(ABC):
49
62
  return self.create_error_response(
50
63
  message=e.detail, err_type=str(e.status_code), status_code=e.status_code
51
64
  )
65
+ except ValueError as e:
66
+ return self.create_error_response(
67
+ message=str(e),
68
+ err_type="BadRequest",
69
+ status_code=400,
70
+ )
52
71
  except Exception as e:
53
72
  logger.exception(f"Error in request: {e}")
54
73
  return self.create_error_response(
@@ -73,10 +92,24 @@ class OpenAIServingBase(ABC):
73
92
 
74
93
  return f"{self._request_id_prefix()}{uuid.uuid4().hex}"
75
94
 
95
def _compute_extra_key(self, request: OpenAIServingRequest) -> Optional[str]:
    """Build the final extra key from ``cache_salt`` and ``extra_key``.

    The two attributes are read from ``request`` in that order; missing or
    falsy values are ignored and the remaining ones are joined with no
    separator.

    Returns:
        The joined string, or ``None`` when neither attribute has a value.

    Raises:
        TypeError: If a truthy value is not a string.
    """
    pieces = []
    for attr_name in ("cache_salt", "extra_key"):
        attr_value = getattr(request, attr_name, None)
        if not attr_value:
            continue
        if not isinstance(attr_value, str):
            raise TypeError(
                f"Value of {attr_name} must be a string, but got {type(attr_value).__name__}"
            )
        pieces.append(attr_value)

    if not pieces:
        return None
    return "".join(pieces)
107
+
76
108
  @abstractmethod
77
109
  def _convert_to_internal_request(
78
110
  self,
79
111
  request: OpenAIServingRequest,
112
+ raw_request: Request = None,
80
113
  ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
81
114
  """Convert OpenAI request to internal format"""
82
115
  pass
@@ -150,3 +183,32 @@ class OpenAIServingBase(ABC):
150
183
  code=status_code,
151
184
  )
152
185
  return json.dumps({"error": error.model_dump()})
186
+
187
def extract_custom_labels(self, raw_request):
    """Read allowed custom labels from the configured request header.

    The header named by
    ``server_args.tokenizer_metrics_custom_labels_header`` is parsed as
    JSON. When it decodes to a dict, only the entries whose key is in
    ``self.allowed_custom_labels`` are kept.

    Returns:
        The filtered dict, or ``None`` when no labels are allowed, no header
        name is configured, the request or header is missing or empty, the
        header is not valid JSON (the error is logged), or the decoded JSON
        is not a dict.
    """
    if not self.allowed_custom_labels:
        return None

    header_name = (
        self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
    )
    if not header_name:
        return None

    header_value = raw_request.headers.get(header_name) if raw_request else None

    decoded = None
    if header_value:
        try:
            decoded = json.loads(header_value)
        except json.JSONDecodeError as e:
            # Malformed header: log it and behave as if no labels were sent.
            logger.exception(f"Error in request: {e}")

    if not isinstance(decoded, dict):
        return None

    allowed = self.allowed_custom_labels
    return {name: val for name, val in decoded.items() if name in allowed}