sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
Files changed (282)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +321 -31
  3. sglang/bench_serving.py +10 -3
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +4 -0
  11. sglang/srt/configs/dots_ocr.py +64 -0
  12. sglang/srt/configs/falcon_h1.py +360 -0
  13. sglang/srt/configs/load_config.py +8 -0
  14. sglang/srt/configs/model_config.py +160 -105
  15. sglang/srt/configs/qwen3_vl.py +586 -0
  16. sglang/srt/constrained/base_grammar_backend.py +1 -0
  17. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  18. sglang/srt/constrained/xgrammar_backend.py +6 -4
  19. sglang/srt/debug_utils/dumper.py +10 -3
  20. sglang/srt/disaggregation/ascend/conn.py +2 -2
  21. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  22. sglang/srt/disaggregation/common/conn.py +266 -98
  23. sglang/srt/disaggregation/decode.py +50 -9
  24. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  25. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  26. sglang/srt/disaggregation/mooncake/conn.py +51 -541
  27. sglang/srt/disaggregation/nixl/conn.py +148 -39
  28. sglang/srt/disaggregation/prefill.py +31 -14
  29. sglang/srt/disaggregation/utils.py +36 -5
  30. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  31. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  32. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  33. sglang/srt/distributed/parallel_state.py +135 -80
  34. sglang/srt/entrypoints/engine.py +23 -3
  35. sglang/srt/entrypoints/grpc_request_manager.py +330 -55
  36. sglang/srt/entrypoints/grpc_server.py +232 -102
  37. sglang/srt/entrypoints/http_server.py +49 -9
  38. sglang/srt/entrypoints/openai/protocol.py +110 -5
  39. sglang/srt/entrypoints/openai/serving_base.py +25 -6
  40. sglang/srt/entrypoints/openai/serving_chat.py +178 -49
  41. sglang/srt/entrypoints/openai/serving_completions.py +5 -3
  42. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  43. sglang/srt/entrypoints/openai/serving_responses.py +42 -0
  44. sglang/srt/environ.py +285 -0
  45. sglang/srt/eplb/expert_location.py +30 -5
  46. sglang/srt/function_call/function_call_parser.py +3 -2
  47. sglang/srt/function_call/glm4_moe_detector.py +3 -3
  48. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  49. sglang/srt/function_call/json_array_parser.py +63 -0
  50. sglang/srt/function_call/kimik2_detector.py +17 -4
  51. sglang/srt/function_call/utils.py +96 -5
  52. sglang/srt/grpc/compile_proto.py +245 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
  56. sglang/srt/layers/activation.py +7 -6
  57. sglang/srt/layers/attention/aiter_backend.py +14 -15
  58. sglang/srt/layers/attention/ascend_backend.py +108 -9
  59. sglang/srt/layers/attention/attention_registry.py +206 -0
  60. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  61. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  62. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  63. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  66. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  67. sglang/srt/layers/attention/flashinfer_backend.py +112 -194
  68. sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
  69. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  70. sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
  71. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
  72. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
  73. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
  74. sglang/srt/layers/attention/mamba/mamba.py +566 -1
  75. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  76. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  77. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  78. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  79. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  80. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  81. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  82. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  83. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  84. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  85. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  86. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  87. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  88. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  89. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  90. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  91. sglang/srt/layers/attention/nsa/utils.py +24 -0
  92. sglang/srt/layers/attention/nsa_backend.py +887 -0
  93. sglang/srt/layers/attention/tbo_backend.py +6 -6
  94. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  95. sglang/srt/layers/attention/triton_backend.py +42 -9
  96. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  97. sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
  98. sglang/srt/layers/attention/vision.py +58 -0
  99. sglang/srt/layers/attention/wave_backend.py +4 -4
  100. sglang/srt/layers/communicator.py +8 -0
  101. sglang/srt/layers/dp_attention.py +11 -1
  102. sglang/srt/layers/elementwise.py +3 -1
  103. sglang/srt/layers/layernorm.py +2 -0
  104. sglang/srt/layers/linear.py +21 -4
  105. sglang/srt/layers/logits_processor.py +15 -2
  106. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  107. sglang/srt/layers/moe/ep_moe/layer.py +147 -74
  108. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  109. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  110. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  111. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  112. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
  113. sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
  114. sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
  115. sglang/srt/layers/moe/utils.py +10 -0
  116. sglang/srt/layers/parameter.py +23 -6
  117. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  118. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  119. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  120. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  121. sglang/srt/layers/quantization/fp8.py +2 -2
  122. sglang/srt/layers/quantization/fp8_utils.py +1 -1
  123. sglang/srt/layers/quantization/modelopt_quant.py +44 -9
  124. sglang/srt/layers/quantization/mxfp4.py +12 -4
  125. sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
  126. sglang/srt/layers/quantization/w4afp8.py +0 -4
  127. sglang/srt/layers/quantization/w8a8_int8.py +15 -3
  128. sglang/srt/layers/rotary_embedding.py +78 -31
  129. sglang/srt/layers/sampler.py +52 -4
  130. sglang/srt/layers/utils.py +23 -0
  131. sglang/srt/lora/backend/base_backend.py +3 -3
  132. sglang/srt/lora/backend/chunked_backend.py +348 -0
  133. sglang/srt/lora/backend/triton_backend.py +10 -4
  134. sglang/srt/lora/lora.py +7 -5
  135. sglang/srt/lora/lora_manager.py +17 -6
  136. sglang/srt/lora/mem_pool.py +1 -1
  137. sglang/srt/lora/triton_ops/__init__.py +4 -0
  138. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  139. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  140. sglang/srt/lora/utils.py +7 -5
  141. sglang/srt/managers/cache_controller.py +42 -142
  142. sglang/srt/managers/data_parallel_controller.py +11 -46
  143. sglang/srt/managers/detokenizer_manager.py +11 -11
  144. sglang/srt/managers/io_struct.py +162 -118
  145. sglang/srt/managers/mm_utils.py +43 -6
  146. sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
  147. sglang/srt/managers/multimodal_processor.py +1 -2
  148. sglang/srt/managers/overlap_utils.py +53 -0
  149. sglang/srt/managers/schedule_batch.py +167 -86
  150. sglang/srt/managers/schedule_policy.py +143 -16
  151. sglang/srt/managers/scheduler.py +359 -214
  152. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  153. sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
  154. sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
  155. sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
  156. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  157. sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
  158. sglang/srt/managers/tokenizer_manager.py +84 -136
  159. sglang/srt/managers/tp_worker.py +39 -29
  160. sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
  161. sglang/srt/managers/utils.py +1 -45
  162. sglang/srt/mem_cache/allocator.py +14 -20
  163. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  164. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  165. sglang/srt/mem_cache/chunk_cache.py +8 -1
  166. sglang/srt/mem_cache/evict_policy.py +23 -0
  167. sglang/srt/mem_cache/hicache_storage.py +40 -1
  168. sglang/srt/mem_cache/hiradix_cache.py +119 -32
  169. sglang/srt/mem_cache/memory_pool.py +188 -10
  170. sglang/srt/mem_cache/memory_pool_host.py +134 -182
  171. sglang/srt/mem_cache/radix_cache.py +222 -71
  172. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  173. sglang/srt/mem_cache/storage/__init__.py +10 -0
  174. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  175. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  176. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  177. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  178. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  179. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
  180. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
  181. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
  182. sglang/srt/mem_cache/swa_radix_cache.py +25 -34
  183. sglang/srt/metrics/collector.py +82 -120
  184. sglang/srt/metrics/func_timer.py +2 -7
  185. sglang/srt/metrics/utils.py +8 -1
  186. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  187. sglang/srt/model_executor/cuda_graph_runner.py +39 -32
  188. sglang/srt/model_executor/forward_batch_info.py +23 -38
  189. sglang/srt/model_executor/model_runner.py +131 -183
  190. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  191. sglang/srt/model_loader/loader.py +14 -10
  192. sglang/srt/model_loader/weight_utils.py +156 -2
  193. sglang/srt/models/bailing_moe.py +27 -4
  194. sglang/srt/models/deepseek_nextn.py +6 -1
  195. sglang/srt/models/deepseek_v2.py +536 -153
  196. sglang/srt/models/dots_ocr.py +173 -0
  197. sglang/srt/models/falcon_h1.py +576 -0
  198. sglang/srt/models/gemma3_causal.py +0 -2
  199. sglang/srt/models/gemma3_mm.py +1 -1
  200. sglang/srt/models/gemma3n_mm.py +1 -1
  201. sglang/srt/models/glm4_moe.py +3 -3
  202. sglang/srt/models/glm4_moe_nextn.py +2 -2
  203. sglang/srt/models/glm4v.py +1 -1
  204. sglang/srt/models/glm4v_moe.py +1 -1
  205. sglang/srt/models/gpt_oss.py +7 -30
  206. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  207. sglang/srt/models/llama.py +4 -0
  208. sglang/srt/models/longcat_flash.py +1 -1
  209. sglang/srt/models/longcat_flash_nextn.py +1 -1
  210. sglang/srt/models/mllama4.py +15 -4
  211. sglang/srt/models/qwen2.py +0 -7
  212. sglang/srt/models/qwen2_5_vl.py +2 -2
  213. sglang/srt/models/qwen2_audio.py +1 -1
  214. sglang/srt/models/qwen2_moe.py +64 -1
  215. sglang/srt/models/qwen2_vl.py +1 -1
  216. sglang/srt/models/qwen3.py +18 -3
  217. sglang/srt/models/qwen3_moe.py +31 -3
  218. sglang/srt/models/qwen3_next.py +36 -9
  219. sglang/srt/models/qwen3_vl.py +787 -0
  220. sglang/srt/models/qwen3_vl_moe.py +471 -0
  221. sglang/srt/models/registry.py +15 -3
  222. sglang/srt/models/sarashina2_vision.py +269 -0
  223. sglang/srt/models/solar.py +505 -0
  224. sglang/srt/models/starcoder2.py +357 -0
  225. sglang/srt/models/torch_native_llama.py +9 -2
  226. sglang/srt/models/utils.py +51 -0
  227. sglang/srt/multimodal/processors/base_processor.py +15 -7
  228. sglang/srt/multimodal/processors/dots_vlm.py +2 -3
  229. sglang/srt/multimodal/processors/internvl.py +20 -8
  230. sglang/srt/multimodal/processors/qwen_vl.py +8 -1
  231. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  232. sglang/srt/parser/jinja_template_utils.py +6 -0
  233. sglang/srt/sampling/sampling_batch_info.py +20 -2
  234. sglang/srt/sampling/sampling_params.py +7 -0
  235. sglang/srt/server_args.py +753 -295
  236. sglang/srt/server_args_config_parser.py +146 -0
  237. sglang/srt/single_batch_overlap.py +151 -0
  238. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  239. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  240. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  241. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  242. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  243. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  244. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
  245. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
  246. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
  247. sglang/srt/speculative/eagle_worker.py +57 -25
  248. sglang/srt/speculative/ngram_utils.py +428 -0
  249. sglang/srt/speculative/ngram_worker.py +245 -0
  250. sglang/srt/speculative/spec_info.py +47 -0
  251. sglang/srt/speculative/spec_utils.py +606 -0
  252. sglang/srt/torch_memory_saver_adapter.py +5 -7
  253. sglang/srt/tracing/trace.py +32 -6
  254. sglang/srt/two_batch_overlap.py +8 -5
  255. sglang/srt/utils/__init__.py +2 -0
  256. sglang/srt/{utils.py → utils/common.py} +399 -74
  257. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
  258. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  259. sglang/srt/utils/rpd_utils.py +452 -0
  260. sglang/srt/utils/slow_rank_detector.py +71 -0
  261. sglang/srt/warmup.py +8 -4
  262. sglang/srt/weight_sync/utils.py +1 -1
  263. sglang/test/get_logits_ut.py +57 -0
  264. sglang/test/run_eval.py +79 -11
  265. sglang/test/runners.py +1 -1
  266. sglang/test/simple_eval_common.py +5 -2
  267. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  268. sglang/test/test_block_fp8.py +2 -2
  269. sglang/test/test_deterministic.py +297 -0
  270. sglang/test/test_disaggregation_utils.py +12 -1
  271. sglang/test/test_programs.py +1 -1
  272. sglang/test/test_utils.py +355 -4
  273. sglang/utils.py +10 -1
  274. sglang/version.py +1 -1
  275. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
  276. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
  277. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  278. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  279. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  280. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
  281. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
  282. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/protocol.py

@@ -16,12 +16,14 @@
 import time
 import uuid
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, TypeAlias, Union
+from typing import Any, Dict, List, NamedTuple, Optional, TypeAlias, Union
 
 from openai.types.responses import (
     ResponseFunctionToolCall,
     ResponseInputItemParam,
     ResponseOutputItem,
+    ResponseOutputMessage,
+    ResponseOutputText,
     ResponseReasoningItem,
 )
 from openai.types.responses.response import ToolChoice
@@ -228,9 +230,15 @@ class CompletionRequest(BaseModel):
 
     # For request id
     rid: Optional[Union[List[str], str]] = None
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Cache salt for request caching
+    cache_salt: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
 
-    # For customer metric labels
-    customer_labels: Optional[Dict[str, str]] = None
+    # For custom metric labels
+    custom_labels: Optional[Dict[str, str]] = None
 
     @field_validator("max_tokens")
     @classmethod
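Note: the same `extra_key`/`cache_salt`/`priority` trio is added to `ChatCompletionRequest` below, and `EmbeddingRequest` gains `priority`. A hedged client-side sketch of how the new fields ride along on an otherwise standard payload; the port, model name, and field values are illustrative, not taken from the diff:

import requests

# Illustrative request against a local SGLang server (assumed at :30000).
resp = requests.post(
    "http://localhost:30000/v1/completions",
    json={
        "model": "my-model",  # placeholder model name
        "prompt": "Hello",
        "max_tokens": 16,
        "cache_salt": "tenant-a",      # salts request caching per tenant
        "extra_key": "experiment-42",  # classifies the request
        "priority": 1,                 # scheduling hint; semantics are server-defined
    },
)
print(resp.json())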
@@ -337,7 +345,7 @@ class FunctionResponse(BaseModel):
     """Function response."""
 
     name: Optional[str] = None
-    arguments: Optional[str] = None
+    arguments: Optional[str | Dict[str, Any]] = None
 
 
 class ToolCall(BaseModel):
@@ -386,7 +394,7 @@ class Function(BaseModel):
     """Function descriptions."""
 
     description: Optional[str] = Field(default=None, examples=[None])
-    name: Optional[str] = None
+    name: str
     parameters: Optional[object] = None
     strict: bool = False
 
@@ -543,6 +551,12 @@ class ChatCompletionRequest(BaseModel):
 
     # For request id
     rid: Optional[Union[List[str], str]] = None
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Cache salt for request caching
+    cache_salt: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
 
     # For PD disaggregation
     bootstrap_host: Optional[Union[List[str], str]] = None
@@ -644,6 +658,8 @@ class EmbeddingRequest(BaseModel):
 
     # The request id.
     rid: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
 
 
 class EmbeddingObject(BaseModel):
@@ -772,6 +788,13 @@ class ResponsesRequest(BaseModel):
         description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
     )
     priority: int = Field(default=0, description="Request priority")
+    extra_key: Optional[str] = Field(
+        default=None,
+        description="Extra key for classifying the request (e.g. cache_salt)",
+    )
+    cache_salt: Optional[str] = Field(
+        default=None, description="Cache salt for request caching"
+    )
 
     # SGLang-specific sampling parameters
     frequency_penalty: float = 0.0
@@ -860,6 +883,26 @@ class ResponsesResponse(BaseModel):
     tool_choice: str = "auto"
     tools: List[ResponseTool] = Field(default_factory=list)
 
+    # OpenAI compatibility fields. Not all are used at the moment.
+    # Recommend checking https://platform.openai.com/docs/api-reference/responses
+    error: Optional[dict] = None
+    incomplete_details: Optional[dict] = None  # TODO(v): support this input
+    instructions: Optional[str] = None
+    max_output_tokens: Optional[int] = None
+    previous_response_id: Optional[str] = None
+    reasoning: Optional[dict] = (
+        # Unused. No model supports this. For GPT-OSS, the system prompt sets
+        # the field, not server args.
+        None  # {"effort": Optional[str], "summary": Optional[str]}
+    )
+    store: Optional[bool] = None
+    temperature: Optional[float] = None
+    text: Optional[dict] = None  # e.g. {"format": {"type": "text"}}
+    top_p: Optional[float] = None
+    truncation: Optional[str] = None
+    user: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+
     @classmethod
     def from_request(
         cls,
@@ -874,6 +917,41 @@ class ResponsesResponse(BaseModel):
         usage: Optional[UsageInfo],
     ) -> "ResponsesResponse":
         """Create a response from a request."""
+
+        # Determine if the output is plain text only to set text.format
+        def _is_text_only(
+            items: List[
+                Union[
+                    ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall
+                ]
+            ]
+        ) -> bool:
+            if not items:
+                return False
+            for it in items:
+                # A reasoning item or tool call -> not pure text.
+                if isinstance(it, ResponseReasoningItem) or isinstance(
+                    it, ResponseFunctionToolCall
+                ):
+                    return False
+                try:
+                    if isinstance(it, ResponseOutputText):
+                        continue
+                    elif isinstance(it, ResponseOutputMessage):
+                        if not it.content:
+                            continue
+                        for c in it.content:
+                            if not isinstance(c, ResponseOutputText):
+                                return False
+                    else:
+                        # Unknown type, not considered text-only
+                        return False
+                except AttributeError:
+                    return False
+            return True
+
+        text_format = {"format": {"type": "text"}} if _is_text_only(output) else None
+
         return cls(
             id=request.request_id,
             created_at=created_time,
@@ -884,6 +962,23 @@ class ResponsesResponse(BaseModel):
             parallel_tool_calls=request.parallel_tool_calls or True,
             tool_choice=request.tool_choice,
             tools=request.tools,
+            # Fields for parity with v1/responses
+            error=None,
+            incomplete_details=None,
+            instructions=request.instructions,
+            max_output_tokens=request.max_output_tokens,
+            previous_response_id=request.previous_response_id,  # TODO(v): ensure this is propagated if retrieved from store
+            reasoning={
+                "effort": request.reasoning.effort if request.reasoning else None,
+                "summary": None,  # unused
+            },
+            store=request.store,
+            temperature=request.temperature,
+            text=text_format,  # TODO(v): expand coverage per https://platform.openai.com/docs/api-reference/responses/list
+            top_p=request.top_p,
+            truncation=request.truncation,
+            user=request.user,
+            metadata=request.metadata or {},
         )
 
 
@@ -922,6 +1017,16 @@ class MessageProcessingResult:
     tool_call_constraint: Optional[Any] = None
 
 
+class ToolCallProcessingResult(NamedTuple):
+    """Result of processing tool calls in a response."""
+
+    tool_calls: Optional[
+        List[Any]
+    ]  # List of ToolCall objects or None if parsing failed
+    remaining_text: str  # Text remaining after parsing tool calls
+    finish_reason: Dict[str, Any]  # Updated finish reason dictionary
+
+
 class ResponseReasoningTextContent(BaseModel):
     text: str
     type: Literal["reasoning_text"] = "reasoning_text"
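Note: because `ToolCallProcessingResult` is a `NamedTuple`, it unpacks exactly like the bare 3-tuple that `_process_tool_calls` used to return, so existing `tool_calls, text, finish_reason = ...` call sites keep working while gaining named access. A standalone sketch:

from typing import Any, Dict, List, NamedTuple, Optional

class ToolCallProcessingResult(NamedTuple):
    tool_calls: Optional[List[Any]]
    remaining_text: str
    finish_reason: Dict[str, Any]

result = ToolCallProcessingResult(None, "no tool calls here", {"type": "stop"})

# Positional unpacking, as at the pre-existing call sites:
tool_calls, text, finish_reason = result

# Named access, newly available:
assert result.remaining_text == text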
sglang/srt/entrypoints/openai/serving_base.py

@@ -27,10 +27,10 @@ class OpenAIServingBase(ABC):
         self.tokenizer_manager = tokenizer_manager
         self.allowed_custom_labels = (
             set(
-                self.tokenizer_manager.server_args.tokenizer_metrics_allowed_customer_labels
+                self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
             )
             if isinstance(self.tokenizer_manager.server_args, ServerArgs)
-            and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_customer_labels
+            and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
             else None
         )
 
@@ -62,6 +62,12 @@
             return self.create_error_response(
                 message=e.detail, err_type=str(e.status_code), status_code=e.status_code
             )
+        except ValueError as e:
+            return self.create_error_response(
+                message=str(e),
+                err_type="BadRequest",
+                status_code=400,
+            )
         except Exception as e:
             logger.exception(f"Error in request: {e}")
             return self.create_error_response(
@@ -86,6 +92,19 @@
 
         return f"{self._request_id_prefix()}{uuid.uuid4().hex}"
 
+    def _compute_extra_key(self, request: OpenAIServingRequest) -> Optional[str]:
+        """Compute the final extra_key by concatenating cache_salt and extra_key if both are provided."""
+        parts = []
+        for key in ["cache_salt", "extra_key"]:
+            value = getattr(request, key, None)
+            if value:
+                if not isinstance(value, str):
+                    raise TypeError(
+                        f"Value of {key} must be a string, but got {type(value).__name__}"
+                    )
+                parts.append(value)
+        return "".join(parts) if parts else None
+
     @abstractmethod
     def _convert_to_internal_request(
         self,
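Note: `_compute_extra_key` concatenates in a fixed order, `cache_salt` first, then `extra_key`, with no separator. A sketch of the observable behavior using a stand-in request object (names are illustrative):

class _Req:  # stand-in for an OpenAIServingRequest
    def __init__(self, cache_salt=None, extra_key=None):
        self.cache_salt = cache_salt
        self.extra_key = extra_key

def compute_extra_key(request):
    # Mirrors the logic above.
    parts = []
    for key in ["cache_salt", "extra_key"]:
        value = getattr(request, key, None)
        if value:
            if not isinstance(value, str):
                raise TypeError(f"Value of {key} must be a string")
            parts.append(value)
    return "".join(parts) if parts else None

assert compute_extra_key(_Req()) is None
assert compute_extra_key(_Req(cache_salt="s1")) == "s1"
assert compute_extra_key(_Req(cache_salt="s1", extra_key="k1")) == "s1k1"  # salt first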
@@ -165,14 +184,14 @@
         )
         return json.dumps({"error": error.model_dump()})
 
-    def extract_customer_labels(self, raw_request):
+    def extract_custom_labels(self, raw_request):
         if (
             not self.allowed_custom_labels
             or not self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
         ):
             return None
 
-        customer_labels = None
+        custom_labels = None
         header = (
             self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
         )
@@ -187,9 +206,9 @@
             raw_labels = None
 
         if isinstance(raw_labels, dict):
-            customer_labels = {
+            custom_labels = {
                 label: value
                 for label, value in raw_labels.items()
                 if label in self.allowed_custom_labels
             }
-        return customer_labels
+        return custom_labels
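Note: the rename is purely `customer_*` to `custom_*`; the mechanics are unchanged. The client sends a JSON object in the header named by `tokenizer_metrics_custom_labels_header`, and only labels in the `tokenizer_metrics_allowed_custom_labels` allow-list survive. A sketch with illustrative values:

import json

allowed_custom_labels = {"team", "env"}  # illustrative allow-list
labels_header = "x-custom-labels"        # illustrative header name

request_headers = {labels_header: json.dumps({"team": "search", "region": "eu"})}

raw_labels = json.loads(request_headers.get(labels_header, "null"))
custom_labels = (
    {k: v for k, v in raw_labels.items() if k in allowed_custom_labels}
    if isinstance(raw_labels, dict)
    else None
)
assert custom_labels == {"team": "search"}  # "region" is filtered out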
sglang/srt/entrypoints/openai/serving_chat.py

@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
+from jsonschema import Draft202012Validator, SchemaError
 
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -25,6 +26,8 @@ from sglang.srt.entrypoints.openai.protocol import (
     LogProbs,
     MessageProcessingResult,
     ToolCall,
+    ToolCallProcessingResult,
+    ToolChoice,
     TopLogprob,
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
@@ -33,7 +36,10 @@ from sglang.srt.entrypoints.openai.utils import (
     process_hidden_states_from_ret,
     to_openai_style_logprobs,
 )
+from sglang.srt.function_call.core_types import ToolCallItem
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
+from sglang.srt.function_call.json_array_parser import JsonArrayParser
+from sglang.srt.function_call.utils import get_json_schema_constraint
 from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.parser.conversation import generate_chat_conv
 from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
@@ -58,6 +64,7 @@ class OpenAIServingChat(OpenAIServingBase):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
         self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+        self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
 
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -74,6 +81,23 @@
         ):
             return "Tools cannot be empty if tool choice is set to required."
 
+        if request.tool_choice is not None and not isinstance(request.tool_choice, str):
+            if not request.tools:
+                return "Tools cannot be empty if tool choice is set to a specific tool."
+            tool_name = request.tool_choice.function.name
+            tool_exists = any(tool.function.name == tool_name for tool in request.tools)
+            if not tool_exists:
+                return f"Tool '{tool_name}' not found in tools list."
+
+        # Validate tool definitions
+        for i, tool in enumerate(request.tools or []):
+            if tool.function.parameters is None:
+                continue
+            try:
+                Draft202012Validator.check_schema(tool.function.parameters)
+            except SchemaError as e:
+                return f"Tool {i} function has invalid 'parameters' schema: {str(e)}"
+
         max_output_tokens = request.max_completion_tokens or request.max_tokens
         server_context_length = self.tokenizer_manager.server_args.context_length
         if (
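Note: `Draft202012Validator.check_schema` validates a tool's `parameters` block against the JSON Schema 2020-12 meta-schema, so malformed tool definitions are rejected up front rather than failing later during constrained decoding. For instance:

from jsonschema import Draft202012Validator, SchemaError

good = {"type": "object", "properties": {"city": {"type": "string"}}}
bad = {"type": "object", "properties": {"city": {"type": "strng"}}}  # invalid type name

Draft202012Validator.check_schema(good)  # passes silently

try:
    Draft202012Validator.check_schema(bad)
except SchemaError as e:
    print(f"invalid 'parameters' schema: {e.message}")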
@@ -128,8 +152,8 @@
         else:
             prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
 
-        # Extract customer labels from raw request headers
-        customer_labels = self.extract_customer_labels(raw_request)
+        # Extract custom labels from raw request headers
+        custom_labels = self.extract_custom_labels(raw_request)
 
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
@@ -149,7 +173,9 @@
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
-            customer_labels=customer_labels,
+            extra_key=self._compute_extra_key(request),
+            priority=request.priority,
+            custom_labels=custom_labels,
         )
 
         return adapted_request, request
@@ -187,6 +213,14 @@
                 tool_call_constraint = parser.get_structure_constraint(
                     request.tool_choice
                 )
+                # Handle JSON schema constraint directly for required or named tool choice
+                if request.tool_choice == "required" or isinstance(
+                    request.tool_choice, ToolChoice
+                ):
+                    json_schema = get_json_schema_constraint(
+                        request.tools, request.tool_choice
+                    )
+                    tool_call_constraint = ("json_schema", json_schema)
 
         # Use chat template
         if self.template_manager.chat_template_name is None:
@@ -434,6 +468,10 @@
                 sampling_params[constraint_type] = convert_json_schema_to_str(
                     constraint_value.model_dump(by_alias=True)
                 )
+            elif constraint_type == "json_schema":
+                sampling_params[constraint_type] = convert_json_schema_to_str(
+                    constraint_value
+                )
             else:
                 sampling_params[constraint_type] = constraint_value
         return sampling_params
@@ -526,10 +564,7 @@
                 stream_buffers[index] = stream_buffer + delta
 
                 # Handle reasoning content
-                if (
-                    self.tokenizer_manager.server_args.reasoning_parser
-                    and request.separate_reasoning
-                ):
+                if self.reasoning_parser and request.separate_reasoning:
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
                     )
@@ -719,7 +754,7 @@
 
         # Handle reasoning content
         reasoning_text = None
-        reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
+        reasoning_parser = self.reasoning_parser
         if reasoning_parser and request.separate_reasoning:
             is_force_reasoning = (
                 self.template_manager.force_reasoning
@@ -747,8 +782,13 @@
             and request.tools
             and self.tool_call_parser
         ):
+            history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
             tool_calls, text, finish_reason = self._process_tool_calls(
-                text, request.tools, finish_reason
+                text,
+                request.tools,
+                finish_reason,
+                request.tool_choice,
+                history_tool_calls_cnt,
             )
 
         choice_data = ChatCompletionResponseChoice(
@@ -838,13 +878,76 @@
         token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=True)
         return ChoiceLogprobs(content=token_logprobs)
 
+    def _process_tool_call_id(
+        self,
+        call_item: ToolCallItem,
+        history_tool_calls_cnt: int,
+    ) -> str:
+        """Generate a new and unique `tool_call_id`."""
+        if self.tool_call_parser != "kimi_k2":
+            # A simple uuid is sufficient for all models except Kimi-K2.
+            tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+            return tool_call_id
+        else:
+            # Align with the Kimi-K2 format: functions.{name}:{index}
+            # Kimi-K2 allows multiple tool_calls in one message; SGLang sets call_item.tool_index to the *local* position inside that message.
+            # The index must therefore be offset as `history_tool_calls_cnt + call_item.tool_index` to be globally unique and properly ordered.
+            tool_call_id = f"functions.{call_item.name}:{history_tool_calls_cnt+call_item.tool_index}"
+            logger.debug(
+                f"Process tool call idx, parser: {self.tool_call_parser}, tool_call_id: {tool_call_id}, history_cnt: {history_tool_calls_cnt}"
+            )
+            return tool_call_id
+
     def _process_tool_calls(
         self,
         text: str,
         tools: List[Any],
         finish_reason: Dict[str, Any],
-    ) -> tuple[Optional[List[ToolCall]], str, Dict[str, Any]]:
+        tool_choice: Optional[Union[str, ToolChoice]] = None,
+        history_tool_calls_cnt: int = 0,
+    ) -> ToolCallProcessingResult:
         """Process tool calls in the response"""
+
+        # Handle required or named tool choice
+        if tool_choice == "required" or (
+            isinstance(tool_choice, ToolChoice) and tool_choice.type == "function"
+        ):
+            # Set finish reason to tool_calls since we're processing tool calls
+            if finish_reason["type"] == "stop":
+                finish_reason["type"] = "tool_calls"
+                finish_reason["matched"] = None
+            try:
+                # For required tool choice, we expect a JSON array of tool calls
+                tool_call_data = json.loads(text)
+                tool_calls = []
+                for i, tool in enumerate(tool_call_data):
+                    # Create a ToolCallItem from the JSON data
+                    call_info = ToolCallItem(
+                        tool_index=i,  # Use the loop index as tool_index
+                        name=tool["name"],
+                        parameters=json.dumps(tool["parameters"], ensure_ascii=False),
+                    )
+                    tool_id = self._process_tool_call_id(
+                        call_info, history_tool_calls_cnt
+                    )
+                    tool_calls.append(
+                        ToolCall(
+                            id=tool_id,
+                            index=i,
+                            function=FunctionResponse(
+                                name=tool["name"],
+                                arguments=json.dumps(
+                                    tool["parameters"], ensure_ascii=False
+                                ),
+                            ),
+                        )
+                    )
+                return ToolCallProcessingResult(tool_calls, "", finish_reason)
+            except json.JSONDecodeError as e:
+                logger.error(f"Tool call parsing error: {e}")
+                return ToolCallProcessingResult(None, text, finish_reason)
+
+        # Use the parser since output is not constrained by a JSON schema
         parser = FunctionCallParser(tools, self.tool_call_parser)
         if parser.has_tool_call(text):
             if finish_reason["type"] == "stop":
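Note: with `tool_choice="required"` or a named tool, generation is constrained by the JSON schema produced by `get_json_schema_constraint` (see the earlier hunk), so the completion text is expected to be a bare JSON array of `{name, parameters}` objects rather than model-specific tool-call markup. A sketch of the expected shape and its mapping to OpenAI-style tool calls (the tool name is illustrative):

import json

# What constrained output looks like for tool_choice="required".
text = '[{"name": "get_weather", "parameters": {"city": "Paris"}}]'

for i, tool in enumerate(json.loads(text)):
    print({
        "index": i,
        "name": tool["name"],
        # Arguments are re-serialized to a JSON string, as in the code above.
        "arguments": json.dumps(tool["parameters"], ensure_ascii=False),
    })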
@@ -854,15 +957,9 @@
                 text, call_info_list = parser.parse_non_stream(text)
                 tool_calls = []
                 for call_info in call_info_list:
-                    # For Kimi-K2, align tool_call_id with the model format: functions.{name}:{index}
-                    if (
-                        self.tool_call_parser == "kimi_k2"
-                        and call_info.name is not None
-                    ):
-                        tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
-                    else:
-                        tool_id = f"call_{uuid.uuid4().hex[:24]}"
-
+                    tool_id = self._process_tool_call_id(
+                        call_info, history_tool_calls_cnt
+                    )
                     tool_calls.append(
                         ToolCall(
                             id=tool_id,
@@ -872,13 +969,13 @@
                             ),
                         )
                     )
-                return tool_calls, text, finish_reason
+                return ToolCallProcessingResult(tool_calls, text, finish_reason)
             except Exception as e:
                 logger.error(f"Tool call parsing error: {e}")
                 # Return error but don't fail the whole request
-                return None, text, finish_reason
+                return ToolCallProcessingResult(None, text, finish_reason)
 
-        return None, text, finish_reason
+        return ToolCallProcessingResult(None, text, finish_reason)
 
     def _process_streaming_logprobs(
         self, content: Dict[str, Any], n_prev_token: int
@@ -911,13 +1008,33 @@
                 or self._get_enable_thinking_from_request(request)
             )
             reasoning_parser_dict[index] = ReasoningParser(
-                self.tokenizer_manager.server_args.reasoning_parser,
+                self.reasoning_parser,
                 request.stream_reasoning,
                 is_force_reasoning,
             )
         reasoning_parser = reasoning_parser_dict[index]
         return reasoning_parser.parse_stream_chunk(delta)
 
+    def _get_history_tool_calls_cnt(self, request: ChatCompletionRequest) -> int:
+        """Counts the number of tool calls in the request's message history.
+
+        NOTE: This method is only useful for models that include a self-incrementing
+        history tool-call index in tool call ids, such as Kimi-K2.
+
+        Args:
+            request: The chat completion request object.
+
+        Returns:
+            The total number of tool calls in the history, or 0 if not applicable.
+        """
+        messages = getattr(request, "messages", [])
+        idx = 0
+        for msg in messages:
+            if msg.role == "assistant":
+                tool_calls = getattr(msg, "tool_calls", None)
+                idx += len(list(tool_calls)) if tool_calls is not None else 0  # noqa
+        return idx
+
     def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
         """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
 
@@ -931,11 +1048,11 @@
         """
         if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
             # For Qwen3 models, `enable_thinking` is supported.
-            if request.chat_template_kwargs.get("enable_thinking") is not None:
-                return request.chat_template_kwargs.get("enable_thinking")
+            if self.reasoning_parser in ["qwen3", "glm45"]:
+                return request.chat_template_kwargs.get("enable_thinking", False)
             # For DeepSeek-V3.1 models, `thinking` is supported.
-            elif request.chat_template_kwargs.get("thinking") is not None:
-                return request.chat_template_kwargs.get("thinking")
+            elif self.reasoning_parser in ["deepseek-v3"]:
+                return request.chat_template_kwargs.get("thinking", False)
             else:
                 return False
         return False
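Note: the rewritten check inverts the old logic: instead of trusting whichever of `enable_thinking`/`thinking` the caller set, the server now consults its configured reasoning parser and reads only the matching kwarg, defaulting to False. A condensed sketch of the new dispatch:

def get_enable_thinking(reasoning_parser, chat_template_kwargs) -> bool:
    # Mirrors the behavior above; the parser decides which kwarg is honored.
    if not chat_template_kwargs:
        return False
    if reasoning_parser in ["qwen3", "glm45"]:
        return chat_template_kwargs.get("enable_thinking", False)
    elif reasoning_parser in ["deepseek-v3"]:
        return chat_template_kwargs.get("thinking", False)
    return False

assert get_enable_thinking("qwen3", {"enable_thinking": True}) is True
assert get_enable_thinking("qwen3", {"thinking": True}) is False  # wrong kwarg for this parser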
@@ -951,13 +1068,25 @@
         ):
             """Process tool calls in streaming response"""
             if index not in parser_dict:
-                parser_dict[index] = FunctionCallParser(
-                    tools=request.tools,
-                    tool_call_parser=self.tool_call_parser,
-                )
+                # Use JSON detector directly for required or named tool choice
+                if request.tool_choice == "required" or isinstance(
+                    request.tool_choice, ToolChoice
+                ):
+                    parser_dict[index] = JsonArrayParser()
+                else:
+                    parser_dict[index] = FunctionCallParser(
+                        tools=request.tools,
+                        tool_call_parser=self.tool_call_parser,
+                    )
+
             parser = parser_dict[index]
 
-            normal_text, calls = parser.parse_stream_chunk(delta)
+            # Handle both FunctionCallParser and JsonArrayParser
+            if isinstance(parser, JsonArrayParser):
+                result = parser.parse_streaming_increment(delta, request.tools)
+                normal_text, calls = result.normal_text, result.calls
+            else:
+                normal_text, calls = parser.parse_stream_chunk(delta)
 
             # Yield normal text
             if normal_text:
@@ -975,6 +1104,7 @@
                 yield f"data: {chunk.model_dump_json()}\n\n"
 
             # Yield tool calls
+            history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
             for call_item in calls:
                 # Mark that this choice has tool calls
                 has_tool_calls[index] = True
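Note: in the streaming path, `_get_history_tool_calls_cnt` and `_process_tool_call_id` combine so that Kimi-K2 ids stay globally unique across the conversation: each new local `tool_index` is offset by the number of tool calls already present in assistant messages. A worked example with illustrative tool names:

# Two tool calls already appear in the conversation history.
history_tool_calls_cnt = 2

# The current message emits two more, with local indices 0 and 1.
for name, tool_index in [("get_weather", 0), ("get_time", 1)]:
    print(f"functions.{name}:{history_tool_calls_cnt + tool_index}")
# -> functions.get_weather:2
# -> functions.get_time:3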
@@ -982,11 +1112,9 @@
                 # Tool call ID should be generated only once per tool call
                 if call_item.name:
                     # First chunk: include ID and function name
-                    if self.tool_call_parser == "kimi_k2":
-                        # Align with Kimi-K2 format: functions.{name}:{index}
-                        tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
-                    else:
-                        tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+                    tool_call_id = self._process_tool_call_id(
+                        call_item, history_tool_calls_cnt
+                    )
                     function_name = call_item.name
                 else:
                     # Subsequent chunks: null ID and name for argument deltas
@@ -1017,7 +1145,7 @@
 
     def _check_for_unstreamed_tool_args(
         self,
-        parser: FunctionCallParser,
+        parser: Union[FunctionCallParser, JsonArrayParser],
        content: Dict[str, Any],
        request: ChatCompletionRequest,
        index: int,
@@ -1027,30 +1155,31 @@
         when generation finishes. This ensures tool calls are properly completed
         even if the model generates the final arguments in the last chunk.
         """
-        # Only check if we have tool calls and the parser has tracked data
+        # Get the detector, either from FunctionCallParser or the parser itself if it is a JSON detector
+        detector = parser.detector if hasattr(parser, "detector") else parser
+
+        # Only check if we have tool calls and the detector has tracked data
         if (
-            not hasattr(parser.detector, "prev_tool_call_arr")
-            or not parser.detector.prev_tool_call_arr
+            not hasattr(detector, "prev_tool_call_arr")
+            or not detector.prev_tool_call_arr
         ):
             return None
 
         if (
-            not hasattr(parser.detector, "streamed_args_for_tool")
-            or not parser.detector.streamed_args_for_tool
+            not hasattr(detector, "streamed_args_for_tool")
+            or not detector.streamed_args_for_tool
        ):
            return None
 
         # Get the last tool call that was being processed
-        tool_index = len(parser.detector.prev_tool_call_arr) - 1
-        if tool_index < 0 or tool_index >= len(parser.detector.streamed_args_for_tool):
+        tool_index = len(detector.prev_tool_call_arr) - 1
+        if tool_index < 0 or tool_index >= len(detector.streamed_args_for_tool):
            return None
 
         # Get expected vs actual arguments
-        expected_args = parser.detector.prev_tool_call_arr[tool_index].get(
-            "arguments", {}
-        )
+        expected_args = detector.prev_tool_call_arr[tool_index].get("arguments", {})
         expected_call = json.dumps(expected_args, ensure_ascii=False)
-        actual_call = parser.detector.streamed_args_for_tool[tool_index]
+        actual_call = detector.streamed_args_for_tool[tool_index]
 
         # Check if there are remaining arguments to send
         remaining_call = (
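Note: the `hasattr` dispatch above works because `FunctionCallParser` wraps a detector object while `JsonArrayParser` plays the detector role itself; both expose `prev_tool_call_arr` and `streamed_args_for_tool`. A minimal sketch of the pattern with stub classes (not the real ones):

class _Detector:  # stub detector state
    prev_tool_call_arr = [{"arguments": {"city": "Paris"}}]
    streamed_args_for_tool = ['{"city": "Pa']

class _WrappingParser:  # stands in for FunctionCallParser
    detector = _Detector()

class _DirectParser(_Detector):  # stands in for JsonArrayParser
    pass

for parser in (_WrappingParser(), _DirectParser()):
    detector = parser.detector if hasattr(parser, "detector") else parser
    # Both paths reach the same tracked state.
    print(detector.prev_tool_call_arr[-1], detector.streamed_args_for_tool[-1])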