sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +321 -31
  3. sglang/bench_serving.py +10 -3
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +4 -0
  11. sglang/srt/configs/dots_ocr.py +64 -0
  12. sglang/srt/configs/falcon_h1.py +360 -0
  13. sglang/srt/configs/load_config.py +8 -0
  14. sglang/srt/configs/model_config.py +160 -105
  15. sglang/srt/configs/qwen3_vl.py +586 -0
  16. sglang/srt/constrained/base_grammar_backend.py +1 -0
  17. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  18. sglang/srt/constrained/xgrammar_backend.py +6 -4
  19. sglang/srt/debug_utils/dumper.py +10 -3
  20. sglang/srt/disaggregation/ascend/conn.py +2 -2
  21. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  22. sglang/srt/disaggregation/common/conn.py +266 -98
  23. sglang/srt/disaggregation/decode.py +50 -9
  24. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  25. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  26. sglang/srt/disaggregation/mooncake/conn.py +51 -541
  27. sglang/srt/disaggregation/nixl/conn.py +148 -39
  28. sglang/srt/disaggregation/prefill.py +31 -14
  29. sglang/srt/disaggregation/utils.py +36 -5
  30. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  31. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  32. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  33. sglang/srt/distributed/parallel_state.py +135 -80
  34. sglang/srt/entrypoints/engine.py +23 -3
  35. sglang/srt/entrypoints/grpc_request_manager.py +330 -55
  36. sglang/srt/entrypoints/grpc_server.py +232 -102
  37. sglang/srt/entrypoints/http_server.py +49 -9
  38. sglang/srt/entrypoints/openai/protocol.py +110 -5
  39. sglang/srt/entrypoints/openai/serving_base.py +25 -6
  40. sglang/srt/entrypoints/openai/serving_chat.py +178 -49
  41. sglang/srt/entrypoints/openai/serving_completions.py +5 -3
  42. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  43. sglang/srt/entrypoints/openai/serving_responses.py +42 -0
  44. sglang/srt/environ.py +285 -0
  45. sglang/srt/eplb/expert_location.py +30 -5
  46. sglang/srt/function_call/function_call_parser.py +3 -2
  47. sglang/srt/function_call/glm4_moe_detector.py +3 -3
  48. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  49. sglang/srt/function_call/json_array_parser.py +63 -0
  50. sglang/srt/function_call/kimik2_detector.py +17 -4
  51. sglang/srt/function_call/utils.py +96 -5
  52. sglang/srt/grpc/compile_proto.py +245 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
  56. sglang/srt/layers/activation.py +7 -6
  57. sglang/srt/layers/attention/aiter_backend.py +14 -15
  58. sglang/srt/layers/attention/ascend_backend.py +108 -9
  59. sglang/srt/layers/attention/attention_registry.py +206 -0
  60. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  61. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  62. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  63. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  66. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  67. sglang/srt/layers/attention/flashinfer_backend.py +112 -194
  68. sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
  69. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  70. sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
  71. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
  72. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
  73. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
  74. sglang/srt/layers/attention/mamba/mamba.py +566 -1
  75. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  76. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  77. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  78. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  79. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  80. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  81. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  82. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  83. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  84. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  85. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  86. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  87. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  88. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  89. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  90. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  91. sglang/srt/layers/attention/nsa/utils.py +24 -0
  92. sglang/srt/layers/attention/nsa_backend.py +887 -0
  93. sglang/srt/layers/attention/tbo_backend.py +6 -6
  94. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  95. sglang/srt/layers/attention/triton_backend.py +42 -9
  96. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  97. sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
  98. sglang/srt/layers/attention/vision.py +58 -0
  99. sglang/srt/layers/attention/wave_backend.py +4 -4
  100. sglang/srt/layers/communicator.py +8 -0
  101. sglang/srt/layers/dp_attention.py +11 -1
  102. sglang/srt/layers/elementwise.py +3 -1
  103. sglang/srt/layers/layernorm.py +2 -0
  104. sglang/srt/layers/linear.py +21 -4
  105. sglang/srt/layers/logits_processor.py +15 -2
  106. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  107. sglang/srt/layers/moe/ep_moe/layer.py +147 -74
  108. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  109. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  110. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  111. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  112. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
  113. sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
  114. sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
  115. sglang/srt/layers/moe/utils.py +10 -0
  116. sglang/srt/layers/parameter.py +23 -6
  117. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  118. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  119. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  120. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  121. sglang/srt/layers/quantization/fp8.py +2 -2
  122. sglang/srt/layers/quantization/fp8_utils.py +1 -1
  123. sglang/srt/layers/quantization/modelopt_quant.py +44 -9
  124. sglang/srt/layers/quantization/mxfp4.py +12 -4
  125. sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
  126. sglang/srt/layers/quantization/w4afp8.py +0 -4
  127. sglang/srt/layers/quantization/w8a8_int8.py +15 -3
  128. sglang/srt/layers/rotary_embedding.py +78 -31
  129. sglang/srt/layers/sampler.py +52 -4
  130. sglang/srt/layers/utils.py +23 -0
  131. sglang/srt/lora/backend/base_backend.py +3 -3
  132. sglang/srt/lora/backend/chunked_backend.py +348 -0
  133. sglang/srt/lora/backend/triton_backend.py +10 -4
  134. sglang/srt/lora/lora.py +7 -5
  135. sglang/srt/lora/lora_manager.py +17 -6
  136. sglang/srt/lora/mem_pool.py +1 -1
  137. sglang/srt/lora/triton_ops/__init__.py +4 -0
  138. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  139. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  140. sglang/srt/lora/utils.py +7 -5
  141. sglang/srt/managers/cache_controller.py +42 -142
  142. sglang/srt/managers/data_parallel_controller.py +11 -46
  143. sglang/srt/managers/detokenizer_manager.py +11 -11
  144. sglang/srt/managers/io_struct.py +162 -118
  145. sglang/srt/managers/mm_utils.py +43 -6
  146. sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
  147. sglang/srt/managers/multimodal_processor.py +1 -2
  148. sglang/srt/managers/overlap_utils.py +53 -0
  149. sglang/srt/managers/schedule_batch.py +167 -86
  150. sglang/srt/managers/schedule_policy.py +143 -16
  151. sglang/srt/managers/scheduler.py +359 -214
  152. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  153. sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
  154. sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
  155. sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
  156. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  157. sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
  158. sglang/srt/managers/tokenizer_manager.py +84 -136
  159. sglang/srt/managers/tp_worker.py +39 -29
  160. sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
  161. sglang/srt/managers/utils.py +1 -45
  162. sglang/srt/mem_cache/allocator.py +14 -20
  163. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  164. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  165. sglang/srt/mem_cache/chunk_cache.py +8 -1
  166. sglang/srt/mem_cache/evict_policy.py +23 -0
  167. sglang/srt/mem_cache/hicache_storage.py +40 -1
  168. sglang/srt/mem_cache/hiradix_cache.py +119 -32
  169. sglang/srt/mem_cache/memory_pool.py +188 -10
  170. sglang/srt/mem_cache/memory_pool_host.py +134 -182
  171. sglang/srt/mem_cache/radix_cache.py +222 -71
  172. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  173. sglang/srt/mem_cache/storage/__init__.py +10 -0
  174. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  175. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  176. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  177. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  178. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  179. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
  180. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
  181. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
  182. sglang/srt/mem_cache/swa_radix_cache.py +25 -34
  183. sglang/srt/metrics/collector.py +82 -120
  184. sglang/srt/metrics/func_timer.py +2 -7
  185. sglang/srt/metrics/utils.py +8 -1
  186. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  187. sglang/srt/model_executor/cuda_graph_runner.py +39 -32
  188. sglang/srt/model_executor/forward_batch_info.py +23 -38
  189. sglang/srt/model_executor/model_runner.py +131 -183
  190. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  191. sglang/srt/model_loader/loader.py +14 -10
  192. sglang/srt/model_loader/weight_utils.py +156 -2
  193. sglang/srt/models/bailing_moe.py +27 -4
  194. sglang/srt/models/deepseek_nextn.py +6 -1
  195. sglang/srt/models/deepseek_v2.py +536 -153
  196. sglang/srt/models/dots_ocr.py +173 -0
  197. sglang/srt/models/falcon_h1.py +576 -0
  198. sglang/srt/models/gemma3_causal.py +0 -2
  199. sglang/srt/models/gemma3_mm.py +1 -1
  200. sglang/srt/models/gemma3n_mm.py +1 -1
  201. sglang/srt/models/glm4_moe.py +3 -3
  202. sglang/srt/models/glm4_moe_nextn.py +2 -2
  203. sglang/srt/models/glm4v.py +1 -1
  204. sglang/srt/models/glm4v_moe.py +1 -1
  205. sglang/srt/models/gpt_oss.py +7 -30
  206. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  207. sglang/srt/models/llama.py +4 -0
  208. sglang/srt/models/longcat_flash.py +1 -1
  209. sglang/srt/models/longcat_flash_nextn.py +1 -1
  210. sglang/srt/models/mllama4.py +15 -4
  211. sglang/srt/models/qwen2.py +0 -7
  212. sglang/srt/models/qwen2_5_vl.py +2 -2
  213. sglang/srt/models/qwen2_audio.py +1 -1
  214. sglang/srt/models/qwen2_moe.py +64 -1
  215. sglang/srt/models/qwen2_vl.py +1 -1
  216. sglang/srt/models/qwen3.py +18 -3
  217. sglang/srt/models/qwen3_moe.py +31 -3
  218. sglang/srt/models/qwen3_next.py +36 -9
  219. sglang/srt/models/qwen3_vl.py +787 -0
  220. sglang/srt/models/qwen3_vl_moe.py +471 -0
  221. sglang/srt/models/registry.py +15 -3
  222. sglang/srt/models/sarashina2_vision.py +269 -0
  223. sglang/srt/models/solar.py +505 -0
  224. sglang/srt/models/starcoder2.py +357 -0
  225. sglang/srt/models/torch_native_llama.py +9 -2
  226. sglang/srt/models/utils.py +51 -0
  227. sglang/srt/multimodal/processors/base_processor.py +15 -7
  228. sglang/srt/multimodal/processors/dots_vlm.py +2 -3
  229. sglang/srt/multimodal/processors/internvl.py +20 -8
  230. sglang/srt/multimodal/processors/qwen_vl.py +8 -1
  231. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  232. sglang/srt/parser/jinja_template_utils.py +6 -0
  233. sglang/srt/sampling/sampling_batch_info.py +20 -2
  234. sglang/srt/sampling/sampling_params.py +7 -0
  235. sglang/srt/server_args.py +753 -295
  236. sglang/srt/server_args_config_parser.py +146 -0
  237. sglang/srt/single_batch_overlap.py +151 -0
  238. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  239. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  240. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  241. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  242. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  243. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  244. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
  245. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
  246. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
  247. sglang/srt/speculative/eagle_worker.py +57 -25
  248. sglang/srt/speculative/ngram_utils.py +428 -0
  249. sglang/srt/speculative/ngram_worker.py +245 -0
  250. sglang/srt/speculative/spec_info.py +47 -0
  251. sglang/srt/speculative/spec_utils.py +606 -0
  252. sglang/srt/torch_memory_saver_adapter.py +5 -7
  253. sglang/srt/tracing/trace.py +32 -6
  254. sglang/srt/two_batch_overlap.py +8 -5
  255. sglang/srt/utils/__init__.py +2 -0
  256. sglang/srt/{utils.py → utils/common.py} +399 -74
  257. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
  258. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  259. sglang/srt/utils/rpd_utils.py +452 -0
  260. sglang/srt/utils/slow_rank_detector.py +71 -0
  261. sglang/srt/warmup.py +8 -4
  262. sglang/srt/weight_sync/utils.py +1 -1
  263. sglang/test/get_logits_ut.py +57 -0
  264. sglang/test/run_eval.py +79 -11
  265. sglang/test/runners.py +1 -1
  266. sglang/test/simple_eval_common.py +5 -2
  267. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  268. sglang/test/test_block_fp8.py +2 -2
  269. sglang/test/test_deterministic.py +297 -0
  270. sglang/test/test_disaggregation_utils.py +12 -1
  271. sglang/test/test_programs.py +1 -1
  272. sglang/test/test_utils.py +355 -4
  273. sglang/utils.py +10 -1
  274. sglang/version.py +1 -1
  275. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
  276. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
  277. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  278. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  279. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  280. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
  281. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
  282. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/serving_completions.py
@@ -90,8 +90,8 @@ class OpenAIServingCompletion(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": prompt}
 
-        # Extract customer labels from raw request headers
-        customer_labels = self.extract_customer_labels(raw_request)
+        # Extract custom labels from raw request headers
+        custom_labels = self.extract_custom_labels(raw_request)
 
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
@@ -107,7 +107,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
-            customer_labels=customer_labels,
+            extra_key=self._compute_extra_key(request),
+            priority=request.priority,
+            custom_labels=custom_labels,
         )
 
         return adapted_request, request
sglang/srt/entrypoints/openai/serving_embedding.py
@@ -125,6 +125,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
         adapted_request = EmbeddingReqInput(
             **prompt_kwargs,
             rid=request.rid,
+            priority=request.priority,
         )
 
         return adapted_request, request
sglang/srt/entrypoints/openai/serving_responses.py
@@ -123,6 +123,39 @@ class OpenAIServingResponses(OpenAIServingChat):
 
         self.background_tasks: dict[str, asyncio.Task] = {}
 
+    # error helpers dedicated for v1/responses
+    def create_error_response(
+        self,
+        message: str,
+        err_type: str = "invalid_request_error",
+        status_code: int = 400,
+        param: Optional[str] = None,
+    ) -> ORJSONResponse:
+        nested_error = {
+            "message": message,
+            "type": err_type,
+            "param": param,
+            "code": status_code,
+        }
+        return ORJSONResponse(content={"error": nested_error}, status_code=status_code)
+
+    def create_streaming_error_response(
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: int = 400,
+    ) -> str:
+        return json.dumps(
+            {
+                "error": {
+                    "message": message,
+                    "type": err_type,
+                    "param": None,
+                    "code": status_code,
+                }
+            }
+        )
+
     def _request_id_prefix(self) -> str:
         return "resp_"
 
@@ -245,6 +278,7 @@ class OpenAIServingResponses(OpenAIServingChat):
             sampling_params=sampling_params,
             stream=request.stream,
             rid=request.request_id,
+            extra_key=self._compute_extra_key(request),
             background=request.background,
         )
 
@@ -833,6 +867,13 @@ class OpenAIServingResponses(OpenAIServingChat):
 
         async for ctx in result_generator:
 
+            # Only process context objects that implement the `is_expecting_start()` method,
+            # which indicates they support per-turn streaming (e.g., StreamingHarmonyContext).
+            # Contexts without this method are skipped, as they do not represent a new turn
+            # or are not compatible with per-turn handling in the /v1/responses endpoint.
+            if not hasattr(ctx, "is_expecting_start"):
+                continue
+
             if ctx.is_expecting_start():
                 current_output_index += 1
                 sent_output_item_added = False
@@ -1250,6 +1291,7 @@ class OpenAIServingResponses(OpenAIServingChat):
             sampling_params=sampling_params,
             stream=adapted_request.stream,
             rid=request_id,
+            extra_key=adapted_request.extra_key,
             return_logprob=adapted_request.return_logprob,
             logprob_start_len=adapted_request.logprob_start_len,
             top_logprobs_num=adapted_request.top_logprobs_num,
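
Note: the nested error body produced by the new create_error_response helper has the following shape; the message value below is a placeholder, not taken from the diff:

    # Illustrative only: shape of the JSON body returned by create_error_response()
    error_body = {
        "error": {
            "message": "prompt is required",  # placeholder message
            "type": "invalid_request_error",
            "param": None,
            "code": 400,
        }
    }
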
sglang/srt/environ.py ADDED
@@ -0,0 +1,285 @@
+import os
+import subprocess
+import warnings
+from contextlib import ExitStack, contextmanager
+from typing import Any
+
+
+class EnvField:
+    def __init__(self, default: Any):
+        self.default = default
+        # NOTE: we use None to indicate whether the value is set or not
+        # If the value is manually set to None, we need mark it as _set_to_none.
+        # Always use clear() to reset the value, which leads to the default fallback.
+        self._set_to_none = False
+
+    def __set_name__(self, owner, name):
+        self.name = name
+
+    def parse(self, value: str) -> Any:
+        raise NotImplementedError()
+
+    def get(self) -> Any:
+        value = os.getenv(self.name)
+        if self._set_to_none:
+            assert value is None
+            return None
+
+        if value is None:
+            return self.default
+
+        try:
+            return self.parse(value)
+        except ValueError as e:
+            warnings.warn(
+                f'Invalid value for {self.name}: {e}, using default "{self.default}"'
+            )
+            return self.default
+
+    def is_set(self):
+        # NOTE: If None is manually set, it is considered as set.
+        return self.name in os.environ or self._set_to_none
+
+    def get_set_value_or(self, or_value: Any):
+        # NOTE: Ugly usage, but only way to get custom default value.
+        return self.get() if self.is_set() else or_value
+
+    def set(self, value: Any):
+        if value is None:
+            self._set_to_none = True
+            os.environ.pop(self.name, None)
+        else:
+            self._set_to_none = False
+            os.environ[self.name] = str(value)
+
+    @contextmanager
+    def override(self, value: Any):
+        backup_present = self.name in os.environ
+        backup_value = os.environ.get(self.name)
+        backup_set_to_none = self._set_to_none
+        self.set(value)
+        yield
+        if backup_present:
+            os.environ[self.name] = backup_value
+        else:
+            os.environ.pop(self.name, None)
+        self._set_to_none = backup_set_to_none
+
+    def clear(self):
+        os.environ.pop(self.name, None)
+        self._set_to_none = False
+
+    @property
+    def value(self):
+        return self.get()
+
+
+class EnvStr(EnvField):
+    def parse(self, value: str) -> str:
+        return value
+
+
+class EnvBool(EnvField):
+    def parse(self, value: str) -> bool:
+        value = value.lower()
+        if value in ["true", "1", "yes", "y"]:
+            return True
+        if value in ["false", "0", "no", "n"]:
+            return False
+        raise ValueError(f'"{value}" is not a valid boolean value')
+
+
+class EnvInt(EnvField):
+    def parse(self, value: str) -> int:
+        try:
+            return int(value)
+        except ValueError:
+            raise ValueError(f'"{value}" is not a valid integer value')
+
+
+class EnvFloat(EnvField):
+    def parse(self, value: str) -> float:
+        try:
+            return float(value)
+        except ValueError:
+            raise ValueError(f'"{value}" is not a valid float value')
+
+
+class Envs:
+    # fmt: off
+
+    # Model & File Download
+    SGLANG_USE_MODELSCOPE = EnvBool(False)
+
+    # Test & Debug
+    SGLANG_IS_IN_CI = EnvBool(False)
+    SGLANG_AMD_CI = EnvBool(False)
+    SGLANG_TEST_RETRACT = EnvBool(False)
+    SGLANG_SET_CPU_AFFINITY = EnvBool(False)
+    SGLANG_PROFILE_WITH_STACK = EnvBool(True)
+    SGLANG_RECORD_STEP_TIME = EnvBool(False)
+    SGLANG_GC_LOG = EnvBool(False)
+    SGLANG_FORCE_SHUTDOWN = EnvBool(False)
+    SGLANG_DEBUG_MEMORY_POOL = EnvBool(False)
+    SGLANG_TEST_REQUEST_TIME_STATS = EnvBool(False)
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK = EnvBool(False)
+    SGLANG_DISABLE_REQUEST_LOGGING = EnvBool(False)
+    SGLANG_SIMULATE_ACC_LEN = EnvFloat(-1)
+    SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
+    SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")
+
+    # Model Parallel
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER = EnvBool(True)
+
+    # Constrained Decoding
+    SGLANG_DISABLE_OUTLINES_DISK_CACHE = EnvBool(True)
+    SGLANG_GRAMMAR_TIMEOUT = EnvFloat(300)
+
+    # Hi-Cache
+    SGLANG_HICACHE_HF3FS_CONFIG_PATH = EnvStr(None)
+
+    # Mooncake KV Transfer
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL = EnvBool(False)
+    ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE = EnvBool(False)
+
+    # AMD & ROCm
+    SGLANG_USE_AITER = EnvBool(False)
+    SGLANG_ROCM_FUSED_DECODE_MLA = EnvBool(False)
+
+    # Quantization
+    SGLANG_INT4_WEIGHT = EnvBool(False)
+    SGLANG_CPU_QUANTIZATION = EnvBool(False)
+    SGLANG_USE_DYNAMIC_MXFP4_LINEAR = EnvBool(False)
+    SGLANG_FORCE_FP8_MARLIN = EnvBool(False)
+
+    # Flashinfer
+    SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
+    SGLANG_ENABLE_FLASHINFER_GEMM = EnvBool(False)
+
+    # Triton
+    SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)
+
+    # Torch Compile
+    SGLANG_ENABLE_TORCH_COMPILE = EnvBool(False)
+
+    # EPLB
+    SGLANG_EXPERT_LOCATION_UPDATER_LOG_INPUT = EnvBool(False)
+    SGLANG_EXPERT_LOCATION_UPDATER_CANARY = EnvBool(False)
+    SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS = EnvBool(False)
+    SGLANG_LOG_EXPERT_LOCATION_METADATA = EnvBool(False)
+
+    # TBO
+    SGLANG_TBO_DEBUG = EnvBool(False)
+
+    # DeepGemm
+    SGLANG_ENABLE_JIT_DEEPGEMM = EnvBool(True)
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE = EnvBool(True)
+    SGLANG_JIT_DEEPGEMM_COMPILE_WORKERS = EnvInt(4)
+    SGLANG_IN_DEEPGEMM_PRECOMPILE_STAGE = EnvBool(False)
+    SGLANG_DG_CACHE_DIR = EnvStr(os.path.expanduser("~/.cache/deep_gemm"))
+    SGLANG_DG_USE_NVRTC = EnvBool(False)
+    SGLANG_USE_DEEPGEMM_BMM = EnvBool(False)
+
+    # sgl-kernel
+    SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK = EnvBool(False)
+
+    # vLLM dependencies
+    USE_VLLM_CUSTOM_ALLREDUCE = EnvBool(False)
+    USE_VLLM_CUTLASS_W8A8_FP8_KERNEL = EnvBool(False)
+
+    USE_TRITON_W8A8_FP8_KERNEL = EnvBool(False)
+    RETURN_ORIGINAL_LOGPROB = EnvBool(False)
+    SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN = EnvBool(False)
+    SGLANG_MOE_PADDING = EnvBool(False)
+    SGLANG_CUTLASS_MOE = EnvBool(False)
+    HF_HUB_DISABLE_XET = EnvBool(False)
+    DISABLE_OPENAPI_DOC = EnvBool(False)
+    SGLANG_ENABLE_TORCH_INFERENCE_MODE = EnvBool(False)
+    SGLANG_IS_FIRST_RANK_ON_NODE = EnvBool(True)
+    SGLANG_SUPPORT_CUTLASS_BLOCK_FP8 = EnvBool(False)
+    SGLANG_SYNC_TOKEN_IDS_ACROSS_TP = EnvBool(False)
+    SGLANG_ENABLE_COLOCATED_BATCH_GEN = EnvBool(False)
+
+    # Deterministic inference
+    SGLANG_ENABLE_DETERMINISTIC_INFERENCE = EnvBool(False)
+    SGLANG_FLASHINFER_PREFILL_SPLIT_TILE_SIZE = EnvInt(4096)
+    SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZE = EnvInt(2048)
+    SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE = EnvInt(4096)
+    SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE = EnvInt(256)
+
+    # fmt: on
+
+
+envs = Envs()
+
+
+def _convert_SGL_to_SGLANG():
+    for key, value in os.environ.items():
+        if key.startswith("SGL_"):
+            new_key = key.replace("SGL_", "SGLANG_", 1)
+            warnings.warn(
+                f"Environment variable {key} is deprecated, please use {new_key}"
+            )
+            os.environ[new_key] = value
+
+
+_convert_SGL_to_SGLANG()
+
+
+def example_with_exit_stack():
+    # Use this style of context manager in unit test
+    exit_stack = ExitStack()
+    exit_stack.enter_context(envs.SGLANG_TEST_RETRACT.override(False))
+    assert envs.SGLANG_TEST_RETRACT.value is False
+    exit_stack.close()
+    assert envs.SGLANG_TEST_RETRACT.value is None
+
+
+def example_with_subprocess():
+    command = ["python", "-c", "import os; print(os.getenv('SGLANG_TEST_RETRACT'))"]
+    with envs.SGLANG_TEST_RETRACT.override(True):
+        process = subprocess.Popen(
+            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        process.wait()
+        output = process.stdout.read().decode("utf-8").strip()
+        assert output == "True"
+
+    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output = process.stdout.read().decode("utf-8").strip()
+    assert output == "None"
+
+
+def examples():
+    # Example usage for envs
+    envs.SGLANG_TEST_RETRACT.clear()
+    assert envs.SGLANG_TEST_RETRACT.value is False
+
+    envs.SGLANG_TEST_RETRACT.set(None)
+    assert envs.SGLANG_TEST_RETRACT.is_set() and envs.SGLANG_TEST_RETRACT.value is None
+
+    envs.SGLANG_TEST_RETRACT.clear()
+    assert not envs.SGLANG_TEST_RETRACT.is_set()
+
+    envs.SGLANG_TEST_RETRACT.set(True)
+    assert envs.SGLANG_TEST_RETRACT.value is True
+
+    with envs.SGLANG_TEST_RETRACT.override(None):
+        assert (
+            envs.SGLANG_TEST_RETRACT.is_set() and envs.SGLANG_TEST_RETRACT.value is None
+        )
+
+    assert envs.SGLANG_TEST_RETRACT.value is True
+
+    envs.SGLANG_TEST_RETRACT.set(None)
+    with envs.SGLANG_TEST_RETRACT.override(True):
+        assert envs.SGLANG_TEST_RETRACT.value is True
+
+    assert envs.SGLANG_TEST_RETRACT.is_set() and envs.SGLANG_TEST_RETRACT.value is None
+
+    example_with_exit_stack()
+    example_with_subprocess()
+
+
+if __name__ == "__main__":
+    examples()
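
Note: a minimal usage sketch of the new envs registry; the module path, attribute names, and the get()/value/override() API come from the file above, while the calling script itself is only illustrative:

    from sglang.srt.environ import envs

    # Read a typed value; falls back to the declared default when the variable is unset.
    if envs.SGLANG_ENABLE_DETERMINISTIC_INFERENCE.get():
        print("deterministic inference enabled")

    # Temporarily override a variable (e.g. in a test), restoring the previous state afterwards.
    with envs.SGLANG_TEST_RETRACT.override(True):
        assert envs.SGLANG_TEST_RETRACT.value is True
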
sglang/srt/eplb/expert_location.py
@@ -231,6 +231,7 @@ class ExpertLocationMetadata:
             logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
             logical_to_rank_dispatch_physical_map=(
                 compute_logical_to_rank_dispatch_physical_map(
+                    server_args=server_args,
                     logical_to_all_physical_map=logical_to_all_physical_map,
                     num_gpus=ep_size,
                     num_physical_experts=num_physical_experts,
@@ -340,6 +341,7 @@ def _pad_nested_array(arr, pad_value):
 
 # TODO optimize performance (rewrite and/or run in separate process with overlap)
 def compute_logical_to_rank_dispatch_physical_map(
+    server_args: ServerArgs,
     logical_to_all_physical_map: torch.Tensor,
     num_gpus: int,
     num_physical_experts: int,
@@ -348,7 +350,9 @@ def compute_logical_to_rank_dispatch_physical_map(
 ):
     r = random.Random(seed)
 
-    num_local_physical_experts = num_physical_experts // num_gpus
+    num_local_gpu_physical_experts = num_physical_experts // num_gpus
+    num_gpus_per_node = server_args.ep_size // server_args.nnodes
+    num_local_node_physical_experts = num_local_gpu_physical_experts * num_gpus_per_node
     num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
     dtype = logical_to_all_physical_map.dtype
 
@@ -372,13 +376,28 @@ def compute_logical_to_rank_dispatch_physical_map(
                     physical_expert_id
                     for physical_expert_id in candidate_physical_expert_ids
                     if _compute_gpu_id_of_physical_expert(
-                        physical_expert_id, num_local_physical_experts
+                        physical_expert_id, num_local_gpu_physical_experts
                     )
                     == gpu_id
                 ]
                 if len(same_gpu_physical_expert_ids) > 0:
+                    # 1. Prefer same-GPU experts
                     output_partial[gpu_id] = same_gpu_physical_expert_ids[0]
-
+                else:
+                    # 2. Otherwise, prefer same-node experts
+                    node_id = gpu_id // num_gpus_per_node
+                    same_node_physical_expert_ids = [
+                        physical_expert_id
+                        for physical_expert_id in candidate_physical_expert_ids
+                        if _compute_node_id_of_physical_expert(
+                            physical_expert_id, num_local_node_physical_experts
+                        )
+                        == node_id
+                    ]
+                    if len(same_node_physical_expert_ids) > 0:
+                        output_partial[gpu_id] = same_node_physical_expert_ids[0]
+
+            # 3. Fill remaining slots with fair random choices
             num_remain = torch.sum(output_partial == -1).item()
             output_partial[output_partial == -1] = torch.tensor(
                 _fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
@@ -404,9 +423,15 @@ def _logical_to_all_physical_raw(
 
 
 def _compute_gpu_id_of_physical_expert(
-    physical_expert_id: int, num_local_physical_experts: int
+    physical_expert_id: int, num_local_gpu_physical_experts: int
+) -> int:
+    return physical_expert_id // num_local_gpu_physical_experts
+
+
+def _compute_node_id_of_physical_expert(
+    physical_expert_id: int, num_local_host_physical_experts: int
 ) -> int:
-    return physical_expert_id // num_local_physical_experts
+    return physical_expert_id // num_local_host_physical_experts
 
 
 def _fair_choices(arr: List, k: int, r: random.Random) -> List:
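
Note: a small worked sketch of the new dispatch arithmetic, using made-up sizes (2 nodes, 8 GPUs, 2 physical experts per GPU) and assuming ep_size equals the total GPU count, as in the helpers above:

    num_gpus = 8                      # ep_size
    nnodes = 2
    num_physical_experts = 16

    num_local_gpu_physical_experts = num_physical_experts // num_gpus                     # 2
    num_gpus_per_node = num_gpus // nnodes                                                 # 4
    num_local_node_physical_experts = num_local_gpu_physical_experts * num_gpus_per_node   # 8

    physical_expert_id = 9
    gpu_id = physical_expert_id // num_local_gpu_physical_experts     # GPU 4
    node_id = physical_expert_id // num_local_node_physical_experts   # node 1 (GPU 4 lives on node 1)
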
sglang/srt/function_call/function_call_parser.py
@@ -20,6 +20,7 @@ from sglang.srt.function_call.pythonic_detector import PythonicDetector
 from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
 from sglang.srt.function_call.qwen25_detector import Qwen25Detector
 from sglang.srt.function_call.step3_detector import Step3Detector
+from sglang.srt.function_call.utils import get_json_schema_constraint
 
 logger = logging.getLogger(__name__)
 
@@ -178,8 +179,8 @@ class FunctionCallParser:
             strict_tag = self.get_structure_tag()
             return ("structural_tag", strict_tag)
         elif tool_choice == "required" or isinstance(tool_choice, ToolChoice):
-            ebnf = self.get_ebnf(tool_choice)
-            return ("ebnf", ebnf) if ebnf is not None else None
+            json_schema = get_json_schema_constraint(self.tools, tool_choice)
+            return ("json_schema", json_schema)
 
     def get_ebnf(
         self, tool_choice: Union[ToolChoice, Literal["required"]]
sglang/srt/function_call/glm4_moe_detector.py
@@ -39,7 +39,7 @@ def parse_arguments(json_value):
 
 class Glm4MoeDetector(BaseFormatDetector):
     """
-    Detector for GLM-4.5 models.
+    Detector for GLM-4.5 and GLM-4.6 models.
     Assumes function call format:
     <tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>北京</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>\n<tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>上海</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>
     """
@@ -53,7 +53,7 @@ class Glm4MoeDetector(BaseFormatDetector):
         self.func_arg_regex = r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>"
 
     def has_tool_call(self, text: str) -> bool:
-        """Check if the text contains a glm-4.5 format tool call."""
+        """Check if the text contains a glm-4.5 / glm-4.6 format tool call."""
         return self.bot_token in text
 
     def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
@@ -102,7 +102,7 @@ class Glm4MoeDetector(BaseFormatDetector):
         self, new_text: str, tools: List[Tool]
     ) -> StreamingParseResult:
         """
-        Streaming incremental parsing tool calls for GLM-4.5 format.
+        Streaming incremental parsing tool calls for GLM-4.5 and GLM-4.6 format.
         """
         self._buffer += new_text
         current_text = self._buffer
sglang/srt/function_call/gpt_oss_detector.py
@@ -81,6 +81,29 @@ class GptOssDetector(BaseFormatDetector):
         # Always use HarmonyParser for parsing to ensure proper filtering
         events = self.harmony_parser.parse(new_text)
 
+        # If there are no parsed events and the chunk contains no Harmony structural
+        # markers, treat it as plain text and pass it through. This fixes a bug where
+        # normal content was held in the buffer when tools were provided but not used.
+        if not events:
+            has_harmony_markers = any(
+                marker in self._buffer
+                for marker in (
+                    "<|start|>",
+                    "<|channel|>",
+                    "<|message|>",
+                    "<|constrain|>",
+                    "<|end|>",
+                    "<|call|>",
+                    "<|return|>",
+                    "assistantfinal",
+                )
+            )
+            if not has_harmony_markers:
+                # Plain text with no tool markers — emit as normal content
+                out = self._buffer
+                self._buffer = ""
+                return StreamingParseResult(normal_text=out, calls=[])
+
         # Quick check if we might have tool calls
         if (
             "<|channel|>commentary to=" not in self._buffer
sglang/srt/function_call/json_array_parser.py ADDED
@@ -0,0 +1,63 @@
+import json
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import StreamingParseResult
+
+
+class JsonArrayParser(BaseFormatDetector):
+    """
+    Parser for JSON array tool calls when JSON schema constraints are active.
+
+    This parser is used when tool_choice="required" or a specific tool is named,
+    bypassing model-specific parsers in favor of direct JSON array parsing.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Configure for JSON array parsing
+        self.bot_token = "["
+        self.eot_token = "]"
+        self.tool_call_separator = ","
+
+    def has_tool_call(self, text: str) -> bool:
+        """
+        Check if the given text contains a JSON tool call (array or single object).
+        """
+        return "[" in text or "{" in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        Parse JSON tool calls using the base class implementation.
+        """
+        raise NotImplementedError(
+            "Detect and parse not supported for JSON schema constraints."
+        )
+
+    def build_ebnf(self, tools: List[Tool]) -> str:
+        """
+        Build an EBNF grammar for constrained generation.
+        This is not used for JSON schema constraints as they are handled
+        by the constraint backends directly.
+        """
+        raise NotImplementedError(
+            "EBNF generation is not supported for JSON schema constraints."
+        )
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing with tool validation.
+        """
+        return super().parse_streaming_increment(new_text, tools)
+
+    def structure_info(self) -> callable:
+        """
+        Return a function that creates StructureInfo for constrained generation.
+        This is not used for JSON schema constraints as they are handled
+        by the constraint backends directly.
+        """
+        raise NotImplementedError("structure_info not used for JSON schema constraints")
sglang/srt/function_call/kimik2_detector.py
@@ -50,6 +50,11 @@ class KimiK2Detector(BaseFormatDetector):
 
         self._last_arguments = ""
 
+        # Robust parser for ids like "functions.search:0" or fallback "search:0"
+        self.tool_call_id_regex = re.compile(
+            r"^(?:functions\.)?(?P<name>[\w\.]+):(?P<index>\d+)$"
+        )
+
     def has_tool_call(self, text: str) -> bool:
         """Check if the text contains a KimiK2 format tool call."""
         return self.bot_token in text
@@ -76,14 +81,18 @@ class KimiK2Detector(BaseFormatDetector):
             tool_calls = []
             for match in function_call_tuples:
                 function_id, function_args = match
-                function_name = function_id.split(".")[1].split(":")[0]
-                function_idx = int(function_id.split(".")[1].split(":")[1])
+                m = self.tool_call_id_regex.match(function_id)
+                if not m:
+                    logger.warning("Unexpected tool_call_id format: %s", function_id)
+                    continue
+                function_name = m.group("name")
+                function_idx = int(m.group("index"))
 
                 logger.info(f"function_name {function_name}")
 
                 tool_calls.append(
                     ToolCallItem(
-                        tool_index=function_idx,  # Use the call index in the response, not tool position
+                        tool_index=function_idx,
                         name=function_name,
                         parameters=function_args,
                     )
@@ -128,7 +137,11 @@ class KimiK2Detector(BaseFormatDetector):
             function_id = match.group("tool_call_id")
             function_args = match.group("function_arguments")
 
-            function_name = function_id.split(".")[1].split(":")[0]
+            m = self.tool_call_id_regex.match(function_id)
+            if not m:
+                logger.warning("Unexpected tool_call_id format: %s", function_id)
+                return StreamingParseResult(normal_text="", calls=calls)
+            function_name = m.group("name")
 
             # Initialize state if this is the first tool call
             if self.current_tool_id == -1:
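
Note: a quick illustration of what the new tool_call_id_regex accepts; the IDs below are made-up examples:

    import re

    tool_call_id_regex = re.compile(r"^(?:functions\.)?(?P<name>[\w\.]+):(?P<index>\d+)$")

    m = tool_call_id_regex.match("functions.get_weather:0")
    assert m.group("name") == "get_weather" and int(m.group("index")) == 0

    m = tool_call_id_regex.match("get_weather:1")  # fallback form without the "functions." prefix
    assert m.group("name") == "get_weather" and int(m.group("index")) == 1
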