sglang 0.4.9.post3__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. sglang/lang/chat_template.py +21 -0
  2. sglang/srt/_custom_ops.py +29 -1
  3. sglang/srt/configs/internvl.py +3 -0
  4. sglang/srt/configs/model_config.py +5 -1
  5. sglang/srt/constrained/base_grammar_backend.py +10 -2
  6. sglang/srt/constrained/xgrammar_backend.py +7 -5
  7. sglang/srt/conversation.py +17 -2
  8. sglang/srt/debug_utils/__init__.py +0 -0
  9. sglang/srt/debug_utils/dump_comparator.py +131 -0
  10. sglang/srt/debug_utils/dumper.py +108 -0
  11. sglang/srt/debug_utils/text_comparator.py +172 -0
  12. sglang/srt/disaggregation/common/conn.py +34 -6
  13. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
  14. sglang/srt/disaggregation/mini_lb.py +3 -2
  15. sglang/srt/disaggregation/mooncake/conn.py +65 -20
  16. sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
  17. sglang/srt/disaggregation/nixl/conn.py +17 -13
  18. sglang/srt/disaggregation/prefill.py +13 -1
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
  20. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
  21. sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
  22. sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
  23. sglang/srt/distributed/parallel_state.py +70 -15
  24. sglang/srt/entrypoints/engine.py +5 -9
  25. sglang/srt/entrypoints/http_server.py +20 -32
  26. sglang/srt/entrypoints/openai/protocol.py +3 -3
  27. sglang/srt/entrypoints/openai/serving_chat.py +148 -72
  28. sglang/srt/function_call/base_format_detector.py +74 -12
  29. sglang/srt/function_call/deepseekv3_detector.py +26 -11
  30. sglang/srt/function_call/ebnf_composer.py +105 -66
  31. sglang/srt/function_call/function_call_parser.py +6 -4
  32. sglang/srt/function_call/glm4_moe_detector.py +164 -0
  33. sglang/srt/function_call/kimik2_detector.py +41 -16
  34. sglang/srt/function_call/llama32_detector.py +6 -3
  35. sglang/srt/function_call/mistral_detector.py +11 -3
  36. sglang/srt/function_call/pythonic_detector.py +16 -14
  37. sglang/srt/function_call/qwen25_detector.py +12 -3
  38. sglang/srt/function_call/{qwen3_detector.py → qwen3_coder_detector.py} +11 -9
  39. sglang/srt/layers/activation.py +11 -3
  40. sglang/srt/layers/attention/base_attn_backend.py +3 -1
  41. sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
  42. sglang/srt/layers/attention/vision.py +56 -8
  43. sglang/srt/layers/communicator.py +12 -12
  44. sglang/srt/layers/dp_attention.py +72 -24
  45. sglang/srt/layers/layernorm.py +26 -1
  46. sglang/srt/layers/logits_processor.py +46 -25
  47. sglang/srt/layers/moe/ep_moe/layer.py +172 -206
  48. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  49. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +25 -224
  51. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
  52. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
  53. sglang/srt/layers/moe/topk.py +88 -34
  54. sglang/srt/layers/multimodal.py +11 -8
  55. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -9
  56. sglang/srt/layers/quantization/fp8.py +25 -247
  57. sglang/srt/layers/quantization/fp8_kernel.py +78 -48
  58. sglang/srt/layers/quantization/modelopt_quant.py +33 -14
  59. sglang/srt/layers/quantization/unquant.py +24 -76
  60. sglang/srt/layers/quantization/utils.py +0 -9
  61. sglang/srt/layers/quantization/w4afp8.py +68 -17
  62. sglang/srt/layers/radix_attention.py +5 -3
  63. sglang/srt/lora/lora_manager.py +133 -169
  64. sglang/srt/lora/lora_registry.py +188 -0
  65. sglang/srt/lora/mem_pool.py +2 -2
  66. sglang/srt/managers/cache_controller.py +62 -13
  67. sglang/srt/managers/io_struct.py +19 -1
  68. sglang/srt/managers/mm_utils.py +154 -35
  69. sglang/srt/managers/multimodal_processor.py +3 -14
  70. sglang/srt/managers/schedule_batch.py +27 -11
  71. sglang/srt/managers/scheduler.py +48 -26
  72. sglang/srt/managers/tokenizer_manager.py +62 -28
  73. sglang/srt/managers/tp_worker.py +5 -4
  74. sglang/srt/mem_cache/allocator.py +67 -7
  75. sglang/srt/mem_cache/hicache_storage.py +17 -1
  76. sglang/srt/mem_cache/hiradix_cache.py +35 -18
  77. sglang/srt/mem_cache/memory_pool_host.py +3 -0
  78. sglang/srt/model_executor/cuda_graph_runner.py +61 -25
  79. sglang/srt/model_executor/forward_batch_info.py +201 -29
  80. sglang/srt/model_executor/model_runner.py +109 -37
  81. sglang/srt/models/deepseek_v2.py +63 -30
  82. sglang/srt/models/glm4_moe.py +1035 -0
  83. sglang/srt/models/glm4_moe_nextn.py +167 -0
  84. sglang/srt/models/interns1.py +328 -0
  85. sglang/srt/models/internvl.py +143 -47
  86. sglang/srt/models/llava.py +9 -5
  87. sglang/srt/models/minicpmo.py +4 -1
  88. sglang/srt/models/mllama4.py +10 -3
  89. sglang/srt/models/qwen2_moe.py +2 -6
  90. sglang/srt/models/qwen3_moe.py +6 -8
  91. sglang/srt/multimodal/processors/base_processor.py +20 -6
  92. sglang/srt/multimodal/processors/clip.py +2 -2
  93. sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
  94. sglang/srt/multimodal/processors/gemma3.py +2 -2
  95. sglang/srt/multimodal/processors/gemma3n.py +2 -2
  96. sglang/srt/multimodal/processors/internvl.py +21 -8
  97. sglang/srt/multimodal/processors/janus_pro.py +2 -2
  98. sglang/srt/multimodal/processors/kimi_vl.py +2 -2
  99. sglang/srt/multimodal/processors/llava.py +4 -4
  100. sglang/srt/multimodal/processors/minicpm.py +2 -3
  101. sglang/srt/multimodal/processors/mlama.py +2 -2
  102. sglang/srt/multimodal/processors/mllama4.py +18 -111
  103. sglang/srt/multimodal/processors/phi4mm.py +2 -2
  104. sglang/srt/multimodal/processors/pixtral.py +2 -2
  105. sglang/srt/multimodal/processors/qwen_audio.py +2 -2
  106. sglang/srt/multimodal/processors/qwen_vl.py +2 -2
  107. sglang/srt/multimodal/processors/vila.py +3 -1
  108. sglang/srt/reasoning_parser.py +48 -5
  109. sglang/srt/sampling/sampling_batch_info.py +6 -5
  110. sglang/srt/server_args.py +132 -60
  111. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
  112. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +37 -36
  113. sglang/srt/speculative/eagle_utils.py +51 -23
  114. sglang/srt/speculative/eagle_worker.py +59 -44
  115. sglang/srt/two_batch_overlap.py +9 -5
  116. sglang/srt/utils.py +113 -69
  117. sglang/srt/weight_sync/utils.py +119 -0
  118. sglang/test/runners.py +4 -0
  119. sglang/test/test_activation.py +50 -1
  120. sglang/test/test_utils.py +65 -5
  121. sglang/utils.py +19 -0
  122. sglang/version.py +1 -1
  123. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +6 -6
  124. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +127 -114
  125. sglang/srt/debug_utils.py +0 -74
  126. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
  127. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
  128. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/http_server.py
@@ -107,6 +107,8 @@ from sglang.version import __version__
 logger = logging.getLogger(__name__)
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
+HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
+
 
 # Store global states
 @dataclasses.dataclass
@@ -212,9 +214,6 @@ async def validate_json_request(raw_request: Request):
         )
 
 
-HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
-
-
 ##### Native API endpoints #####
 
 
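The constant is read from the SGLANG_HEALTH_CHECK_TIMEOUT environment variable once, at import time, so it now sits with the other module-level state instead of among the endpoint definitions. A minimal sketch of overriding the default, assuming the variable is set before the module is imported:

    import os

    # Must happen before sglang.srt.entrypoints.http_server is imported,
    # because HEALTH_CHECK_TIMEOUT is evaluated at module load.
    os.environ["SGLANG_HEALTH_CHECK_TIMEOUT"] = "60"

    from sglang.srt.entrypoints import http_server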
@@ -807,6 +806,24 @@ async def retrieve_model(model: str):
     )
 
 
+@app.post("/v1/score", dependencies=[Depends(validate_json_request)])
+async def v1_score_request(request: ScoringRequest, raw_request: Request):
+    """Endpoint for the decoder-only scoring API. See Engine.score() for detailed documentation."""
+    return await raw_request.app.state.openai_serving_score.handle_request(
+        request, raw_request
+    )
+
+
+@app.api_route(
+    "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
+)
+async def v1_rerank_request(request: V1RerankReqInput, raw_request: Request):
+    """Endpoint for reranking documents based on query relevance."""
+    return await raw_request.app.state.openai_serving_rerank.handle_request(
+        request, raw_request
+    )
+
+
 ## SageMaker API
 @app.get("/ping")
 async def sagemaker_health() -> Response:
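The two routes above are new registrations in the native-API section; the matching removal from their old spot after the Vertex AI routes follows in the next hunk. A hedged usage sketch; the payload field names (query, documents) are assumptions about V1RerankReqInput, not verified against this release:

    import requests

    # Assumed request shape; see ScoringRequest / V1RerankReqInput for the
    # authoritative fields.
    resp = requests.post(
        "http://localhost:30000/v1/rerank",
        json={
            "query": "What is the capital of France?",
            "documents": ["Paris is the capital of France.", "Berlin is in Germany."],
        },
    )
    print(resp.json())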
@@ -852,24 +869,6 @@ async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Reque
     return ORJSONResponse({"predictions": ret})
 
 
-@app.post("/v1/score", dependencies=[Depends(validate_json_request)])
-async def v1_score_request(request: ScoringRequest, raw_request: Request):
-    """Endpoint for the decoder-only scoring API. See Engine.score() for detailed documentation."""
-    return await raw_request.app.state.openai_serving_score.handle_request(
-        request, raw_request
-    )
-
-
-@app.api_route(
-    "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
-)
-async def v1_rerank_request(request: V1RerankReqInput, raw_request: Request):
-    """Endpoint for reranking documents based on query relevance."""
-    return await raw_request.app.state.openai_serving_rerank.handle_request(
-        request, raw_request
-    )
-
-
 def _create_error_response(e):
     return ORJSONResponse(
         {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
@@ -916,15 +915,6 @@ def launch_server(
         add_prometheus_middleware(app)
         enable_func_timer()
 
-    image_token_text = None
-    if (
-        tokenizer_manager.image_token_id is not None
-        and not server_args.skip_tokenizer_init
-    ):
-        image_token_text = tokenizer_manager.tokenizer.decode(
-            [tokenizer_manager.image_token_id]
-        )
-
     # Send a warmup request - we will create the thread launch it
     # in the lifespan after all other warmups have fired.
     warmup_thread = threading.Thread(
@@ -932,7 +922,6 @@
         args=(
            server_args,
            pipe_finish_writer,
-           image_token_text,
            launch_callback,
        ),
    )
@@ -1066,7 +1055,6 @@ def _execute_server_warmup(
 def _wait_and_warmup(
     server_args: ServerArgs,
     pipe_finish_writer: Optional[multiprocessing.connection.Connection],
-    image_token_text: str,
     launch_callback: Optional[Callable[[], None]] = None,
 ):
     if not server_args.skip_server_warmup:
sglang/srt/entrypoints/openai/protocol.py
@@ -192,9 +192,9 @@ class CompletionRequest(BaseModel):
     session_params: Optional[Dict] = None
 
     # For PD disaggregation
-    bootstrap_host: Optional[str] = None
-    bootstrap_port: Optional[int] = None
-    bootstrap_room: Optional[int] = None
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
 
     # For request id
     rid: Optional[Union[List[str], str]] = None
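The bootstrap fields now accept either a scalar or a per-prompt list, mirroring rid above, so a batched PD-disaggregation request can route each prompt to its own prefill bootstrap endpoint. An illustrative request body under the new schema (values invented):

    # Batch of two prompts, one bootstrap triple per prompt.
    payload = {
        "model": "default",
        "prompt": ["Hello", "World"],
        "bootstrap_host": ["10.0.0.1", "10.0.0.2"],
        "bootstrap_port": [8998, None],  # per-entry ports may be None
        "bootstrap_room": [12345, 67890],
    }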
sglang/srt/entrypoints/openai/serving_chat.py
@@ -55,6 +55,20 @@ class OpenAIServingChat(OpenAIServingBase):
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
 
+    def _validate_request(self, request: ChatCompletionRequest) -> Optional[str]:
+        """Validate that the input is valid."""
+        if not request.messages:
+            return "Messages cannot be empty."
+
+        if (
+            isinstance(request.tool_choice, str)
+            and request.tool_choice.lower() == "required"
+            and not request.tools
+        ):
+            return "Tools cannot be empty if tool choice is set to required."
+
+        return None
+
     def _convert_to_internal_request(
         self,
         request: ChatCompletionRequest,
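Two request shapes the new validator rejects, as illustrative payloads (assuming the serving layer surfaces the returned string as an error response):

    # Rejected: empty messages list.
    bad_request_1 = {"model": "default", "messages": []}

    # Rejected: tool_choice="required" with no tools declared.
    bad_request_2 = {
        "model": "default",
        "messages": [{"role": "user", "content": "What is the weather?"}],
        "tool_choice": "required",
    }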
@@ -398,6 +412,8 @@ class OpenAIServingChat(OpenAIServingBase):
         is_firsts = {}
         stream_buffers = {}
         n_prev_tokens = {}
+        has_tool_calls = {}
+        finish_reasons = {}
 
         # Usage tracking
         prompt_tokens = {}
@@ -429,6 +445,10 @@ class OpenAIServingChat(OpenAIServingBase):
             finish_reason = content["meta_info"]["finish_reason"]
             finish_reason_type = finish_reason["type"] if finish_reason else None
 
+            # Track finish_reason for each index
+            if finish_reason_type:
+                finish_reasons[index] = finish_reason
+
             # First chunk with role
             if is_firsts.get(index, True):
                 is_firsts[index] = False
@@ -436,13 +456,8 @@ class OpenAIServingChat(OpenAIServingBase):
                 choice_data = ChatCompletionResponseStreamChoice(
                     index=index,
                     delta=delta,
-                    finish_reason=finish_reason_type,
-                    matched_stop=(
-                        finish_reason["matched"]
-                        if finish_reason and "matched" in finish_reason
-                        else None
-                    ),
-                    logprobs=choice_logprobs,
+                    finish_reason=None,
+                    logprobs=None,
                 )
                 chunk = ChatCompletionStreamResponse(
                     id=content["meta_info"]["id"],
@@ -469,7 +484,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 choice_data = ChatCompletionResponseStreamChoice(
                     index=index,
                     delta=DeltaMessage(reasoning_content=reasoning_text),
-                    finish_reason=finish_reason_type,
+                    finish_reason=None,
                 )
                 chunk = ChatCompletionStreamResponse(
                     id=content["meta_info"]["id"],
@@ -479,9 +494,6 @@ class OpenAIServingChat(OpenAIServingBase):
                 )
                 yield f"data: {chunk.model_dump_json()}\n\n"
 
-            if not delta:
-                continue
-
             # Handle tool calls
             if request.tool_choice != "none" and request.tools:
                 async for chunk in self._process_tool_call_stream(
@@ -490,28 +502,28 @@ class OpenAIServingChat(OpenAIServingBase):
                     parser_dict,
                     content,
                     request,
-                    finish_reason_type,
+                    has_tool_calls,
                 ):
-                    yield chunk
+                    if chunk:
+                        yield chunk
+
+                # Send any remaining tool call arguments when generation finishes
+                if finish_reason_type is not None and index in parser_dict:
+                    parser = parser_dict[index]
+                    remaining_chunk = self._check_for_unstreamed_tool_args(
+                        parser, content, request, index
+                    )
+                    if remaining_chunk:
+                        yield remaining_chunk
+
             else:
                 # Regular content
-                if delta or not (
-                    request.stream_options and request.stream_options.include_usage
-                ):
+                if delta:
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
                         delta=DeltaMessage(content=delta if delta else None),
-                        finish_reason=(
-                            None
-                            if request.stream_options
-                            and request.stream_options.include_usage
-                            else finish_reason_type
-                        ),
-                        matched_stop=(
-                            finish_reason["matched"]
-                            if finish_reason and "matched" in finish_reason
-                            else None
-                        ),
+                        finish_reason=None,
+                        matched_stop=None,
                         logprobs=choice_logprobs,
                     )
                     chunk = ChatCompletionStreamResponse(
@@ -522,26 +534,36 @@ class OpenAIServingChat(OpenAIServingBase):
                     )
                     yield f"data: {chunk.model_dump_json()}\n\n"
 
-        # Final chunk with finish_reason
-        finish_reason_chunk = ChatCompletionStreamResponse(
-            id=content["meta_info"]["id"],
-            created=int(time.time()),
-            choices=[
-                ChatCompletionResponseStreamChoice(
-                    index=index,
-                    delta=DeltaMessage(),
-                    finish_reason=finish_reason_type,
-                    matched_stop=(
-                        finish_reason["matched"]
-                        if finish_reason and "matched" in finish_reason
-                        else None
-                    ),
-                )
-            ],
-            model=request.model,
-            usage=None,
-        )
-        yield f"data: {finish_reason_chunk.model_dump_json()}\n\n"
+        # Send finish_reason chunks for each index that completed
+        for idx, finish_reason_data in finish_reasons.items():
+            finish_reason_type = finish_reason_data["type"]
+
+            # Change finish_reason to "tool_calls" if we had tool calls and stopped naturally
+            final_finish_reason = finish_reason_type
+            if has_tool_calls.get(idx, False) and finish_reason_type == "stop":
+                final_finish_reason = "tool_calls"
+
+            finish_reason_chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"][
+                    "id"
+                ],  # NOTE: openai uses the same chatcmpl-id for all indices
+                created=int(time.time()),
+                choices=[
+                    ChatCompletionResponseStreamChoice(
+                        index=idx,
+                        delta=DeltaMessage(),
+                        finish_reason=final_finish_reason,
+                        matched_stop=(
+                            finish_reason_data["matched"]
+                            if "matched" in finish_reason_data
+                            else None
+                        ),
+                    )
+                ],
+                model=request.model,
+                usage=None,
+            )
+            yield f"data: {finish_reason_chunk.model_dump_json()}\n\n"
 
         # Send hidden states if requested
         if request.return_hidden_states and hidden_states:
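With finish reasons buffered per index and rewritten after the fact, a stream that emitted tool calls and then stopped naturally now ends with finish_reason "tool_calls" rather than "stop", and n > 1 requests get one closing chunk per choice. A sketch of the closing chunk for one choice (field values illustrative, not captured output):

    final_chunk = {
        "id": "chatcmpl-abc123",  # same id across all indices
        "object": "chat.completion.chunk",
        "choices": [
            {
                "index": 0,
                "delta": {},
                "finish_reason": "tool_calls",  # was "stop" before this change
            }
        ],
    }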
@@ -561,7 +583,7 @@ class OpenAIServingChat(OpenAIServingBase):
                         delta=DeltaMessage(
                             hidden_states=last_token_hidden_states
                         ),
-                        finish_reason=finish_reason_type,
+                        finish_reason=None,  # Hidden states don't need finish_reason
                     )
                 ],
                 model=request.model,
@@ -840,7 +862,7 @@ class OpenAIServingChat(OpenAIServingBase):
         parser_dict: Dict[int, FunctionCallParser],
         content: Dict[str, Any],
         request: ChatCompletionRequest,
-        finish_reason_type: Optional[str],
+        has_tool_calls: Dict[int, bool],
     ):
         """Process tool calls in streaming response"""
         if index not in parser_dict:
@@ -857,7 +879,7 @@ class OpenAIServingChat(OpenAIServingBase):
             choice_data = ChatCompletionResponseStreamChoice(
                 index=index,
                 delta=DeltaMessage(content=normal_text),
-                finish_reason=finish_reason_type,
+                finish_reason=None,
             )
             chunk = ChatCompletionStreamResponse(
                 id=content["meta_info"]["id"],
@@ -869,6 +891,9 @@ class OpenAIServingChat(OpenAIServingBase):
 
         # Yield tool calls
         for call_item in calls:
+            # Mark that this choice has tool calls
+            has_tool_calls[index] = True
+
             # Tool call ID should be generated only once per tool call
             if call_item.name:
                 # First chunk: include ID and function name
@@ -879,23 +904,6 @@ class OpenAIServingChat(OpenAIServingBase):
                 tool_call_id = None
                 function_name = None
 
-            if finish_reason_type == "stop":
-                # Handle remaining arguments
-                latest_delta_len = 0
-                if isinstance(call_item.parameters, str):
-                    latest_delta_len = len(call_item.parameters)
-
-                expected_call = json.dumps(
-                    parser.detector.prev_tool_call_arr[index].get("arguments", {}),
-                    ensure_ascii=False,
-                )
-                actual_call = parser.detector.streamed_args_for_tool[index]
-                if latest_delta_len > 0:
-                    actual_call = actual_call[:-latest_delta_len]
-                remaining_call = expected_call.replace(actual_call, "", 1)
-                call_item.parameters = remaining_call
-                finish_reason_type = "tool_calls"
-
             tool_call = ToolCall(
                 id=tool_call_id,
                 index=call_item.tool_index,
@@ -908,11 +916,7 @@ class OpenAIServingChat(OpenAIServingBase):
             choice_data = ChatCompletionResponseStreamChoice(
                 index=index,
                 delta=DeltaMessage(tool_calls=[tool_call]),
-                finish_reason=(
-                    None
-                    if request.stream_options and request.stream_options.include_usage
-                    else finish_reason_type
-                ),
+                finish_reason=None,
             )
             chunk = ChatCompletionStreamResponse(
                 id=content["meta_info"]["id"],
@@ -921,3 +925,75 @@ class OpenAIServingChat(OpenAIServingBase):
                 model=request.model,
             )
             yield f"data: {chunk.model_dump_json()}\n\n"
+
+    def _check_for_unstreamed_tool_args(
+        self,
+        parser: FunctionCallParser,
+        content: Dict[str, Any],
+        request: ChatCompletionRequest,
+        index: int,
+    ) -> Optional[str]:
+        """
+        Check for any remaining tool call arguments that need to be streamed
+        when generation finishes. This ensures tool calls are properly completed
+        even if the model generates the final arguments in the last chunk.
+        """
+        # Only check if we have tool calls and the parser has tracked data
+        if (
+            not hasattr(parser.detector, "prev_tool_call_arr")
+            or not parser.detector.prev_tool_call_arr
+        ):
+            return None
+
+        if (
+            not hasattr(parser.detector, "streamed_args_for_tool")
+            or not parser.detector.streamed_args_for_tool
+        ):
+            return None
+
+        # Get the last tool call that was being processed
+        tool_index = len(parser.detector.prev_tool_call_arr) - 1
+        if tool_index < 0 or tool_index >= len(parser.detector.streamed_args_for_tool):
+            return None
+
+        # Get expected vs actual arguments
+        expected_args = parser.detector.prev_tool_call_arr[tool_index].get(
+            "arguments", {}
+        )
+        expected_call = json.dumps(expected_args, ensure_ascii=False)
+        actual_call = parser.detector.streamed_args_for_tool[tool_index]
+
+        # Check if there are remaining arguments to send
+        remaining_call = (
+            expected_call.replace(actual_call, "", 1)
+            if actual_call in expected_call
+            else ""
+        )
+
+        if remaining_call:
+            # Create tool call chunk with remaining arguments
+            tool_call = ToolCall(
+                id=None,  # No ID for argument deltas
+                index=tool_index,
+                function=FunctionResponse(
+                    name=None,  # No name for argument deltas
+                    arguments=remaining_call,
+                ),
+            )
+
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=index,
+                delta=DeltaMessage(tool_calls=[tool_call]),
+                finish_reason=None,  # Don't send finish_reason with this chunk
+            )
+
+            chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=int(time.time()),
+                choices=[choice_data],
+                model=request.model,
+            )
+
+            return f"data: {chunk.model_dump_json()}\n\n"
+
+        return None
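The remainder computation is a plain string diff over the serialized arguments: whatever prefix of the full JSON was already streamed is removed, and the rest goes out as one last arguments delta. A self-contained illustration of that logic, with toy values standing in for the detector state:

    import json

    # Full arguments the parser reconstructed (prev_tool_call_arr[-1]["arguments"]).
    expected_call = json.dumps({"location": "Tokyo", "unit": "celsius"}, ensure_ascii=False)
    # What was actually streamed so far (streamed_args_for_tool[-1]).
    actual_call = '{"location": "Tokyo"'

    remaining = (
        expected_call.replace(actual_call, "", 1) if actual_call in expected_call else ""
    )
    print(remaining)  # ', "unit": "celsius"}' is sent as the final arguments delta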
sglang/srt/function_call/base_format_detector.py
@@ -25,23 +25,49 @@ class BaseFormatDetector(ABC):
     """Base class providing two sets of interfaces: one-time and streaming incremental."""
 
     def __init__(self):
-        # initialize properties used for state when parsing tool calls in
+        # Streaming state management
+        # Buffer for accumulating incomplete patterns that arrive across multiple streaming chunks
         self._buffer = ""
-        # streaming mode
+        # Stores complete tool call info (name and arguments) for each tool being parsed.
+        # Used by serving layer for completion handling when streaming ends.
+        # Format: [{"name": str, "arguments": dict}, ...]
         self.prev_tool_call_arr: List[Dict] = []
+        # Index of currently streaming tool call. Starts at -1 (no active tool),
+        # increments as each tool completes. Tracks which tool's arguments are streaming.
        self.current_tool_id: int = -1
+        # Flag for whether current tool's name has been sent to client.
+        # Tool names sent first with empty parameters, then arguments stream incrementally.
         self.current_tool_name_sent: bool = False
-        self.streamed_args_for_tool: List[str] = (
-            []
-        )  # map what has been streamed for each tool so far to a list
+        # Tracks raw JSON string content streamed to client for each tool's arguments.
+        # Critical for serving layer to calculate remaining content when streaming ends.
+        # Each index corresponds to a tool_id. Example: ['{"location": "San Francisco"', '{"temp": 72']
+        self.streamed_args_for_tool: List[str] = []
+
+        # Token configuration (override in subclasses)
         self.bot_token = ""
         self.eot_token = ""
         self.tool_call_separator = ", "
 
-    def parse_base_json(self, action: Any, tools: List[Tool]) -> List[ToolCallItem]:
-        tool_indices = {
+    def _get_tool_indices(self, tools: List[Tool]) -> Dict[str, int]:
+        """
+        Get a mapping of tool names to their indices in the tools list.
+
+        This utility method creates a dictionary mapping function names to their
+        indices in the tools list, which is commonly needed for tool validation
+        and ToolCallItem creation.
+
+        Args:
+            tools: List of available tools
+
+        Returns:
+            Dictionary mapping tool names to their indices
+        """
+        return {
             tool.function.name: i for i, tool in enumerate(tools) if tool.function.name
         }
+
+    def parse_base_json(self, action: Any, tools: List[Tool]) -> List[ToolCallItem]:
+        tool_indices = self._get_tool_indices(tools)
         if not isinstance(action, list):
             action = [action]
 
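A toy snapshot of the documented state mid-stream, with one finished tool call and a second whose arguments are still arriving; the values are invented to match the formats described in the comments above:

    # Hypothetical detector state while streaming the second of two tool calls.
    prev_tool_call_arr = [
        {"name": "get_current_weather", "arguments": {"location": "Tokyo"}},
        {"name": "get_current_weather", "arguments": {}},  # still being parsed
    ]
    current_tool_id = 1              # second tool is active
    current_tool_name_sent = True    # its name chunk was already emitted
    streamed_args_for_tool = ['{"location": "Tokyo"}', '{"location": "Par']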
@@ -130,11 +156,7 @@ class BaseFormatDetector(ABC):
 
         # Build tool indices if not already built
         if not hasattr(self, "_tool_indices"):
-            self._tool_indices = {
-                tool.function.name: i
-                for i, tool in enumerate(tools)
-                if tool.function and tool.function.name
-            }
+            self._tool_indices = self._get_tool_indices(tools)
 
         flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
 
@@ -294,12 +316,52 @@ class BaseFormatDetector(ABC):
 
     @abstractmethod
     def has_tool_call(self, text: str) -> bool:
+        """
+        Check if the given text contains function call markers specific to this format.
+        """
         raise NotImplementedError()
 
+    def supports_structural_tag(self) -> bool:
+        """Return True if this detector supports structural tag format."""
+        return True
+
     @abstractmethod
     def structure_info(self) -> _GetInfoFunc:
+        """
+        Return a function that creates StructureInfo for constrained generation.
+
+        The returned function takes a tool name and returns a StructureInfo object
+        containing the begin/end patterns and trigger tokens needed for constrained
+        generation of function calls in this format.
+
+        Returns:
+            A function that takes a tool name (str) and returns StructureInfo
+        """
         raise NotImplementedError()
 
     @abstractmethod
     def build_ebnf(self, tools: List[Tool]) -> str:
+        """
+        Build an EBNF grammar for constrained generation of function calls.
+
+        This method generates an Extended Backus-Naur Form (EBNF) grammar that
+        constrains the model's output to valid function calls in this format.
+        The grammar should include all available tools and their parameter schemas.
+
+        Args:
+            tools: List of available tools/functions that can be called
+
+        Returns:
+            A string containing the EBNF grammar for this function call format
+
+        The EBNF grammar should:
+        - Define the overall structure of function calls in this format
+        - Include all tool names from the provided tools list
+        - Define valid JSON structures for function arguments
+        - Handle multiple function calls if the format supports them
+
+        Note:
+            Most implementations use EBNFComposer.build_ebnf() utility with
+            format-specific parameters rather than writing EBNF from scratch.
+        """
         raise NotImplementedError()
sglang/srt/function_call/deepseekv3_detector.py
@@ -19,9 +19,28 @@ logger = logging.getLogger(__name__)
 
 class DeepSeekV3Detector(BaseFormatDetector):
     """
-    Detector for DeepSeek models.
-    Assumes function call format:
-    '<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{"location": "Tokyo"}\n```<|tool▁call▁end|>\n<|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{"location": "Paris"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|>
+    Detector for DeepSeek V3 model function call format.
+
+    The DeepSeek V3 format uses special Unicode tokens to delimit function calls
+    with JSON code blocks for arguments.
+
+    Format Structure:
+    ```
+    <|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>{function_name}\n```json\n{json_arguments}\n```<|tool▁calls▁end|><|end▁of▁sentence|>
+    ```
+    Examples:
+    ```
+    <|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{"location": "Tokyo"}\n```<|tool▁call▁end|>\n<|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{"location": "Paris"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|>
+    ```
+
+    Key Components:
+    - Tool Calls Section: Wrapped between `<|tool▁calls▁begin|>` and `<|tool▁calls▁end|>`
+    - Individual Tool Call: Wrapped between `<|tool▁call▁begin|>` and `<|tool▁call▁end|>`
+    - Function Declaration: `function<|tool▁sep|>{function_name}`
+    - Arguments: JSON code block between ```json and ```
+    - Supports multiple tool calls
+
+    Reference: https://huggingface.co/deepseek-ai/DeepSeek-V3-0324?chat_template=default
     """
 
     def __init__(self):
@@ -89,16 +108,12 @@ class DeepSeekV3Detector(BaseFormatDetector):
             return StreamingParseResult(normal_text=new_text)
 
         if not hasattr(self, "_tool_indices"):
-            self._tool_indices = {
-                tool.function.name: i
-                for i, tool in enumerate(tools)
-                if tool.function and tool.function.name
-            }
+            self._tool_indices = self._get_tool_indices(tools)
 
         calls: list[ToolCallItem] = []
         try:
             partial_match = re.search(
-                pattern=r"<|tool▁call▁begin|>(.*)<|tool▁sep|>(.*)\n```json\n(.*)",
+                pattern=r"<|tool▁call▁begin|>(.*)<|tool▁sep|>(.*)\n```json\n(.*)\n```.*",
                 string=current_text,
                 flags=re.DOTALL,
             )
@@ -127,7 +142,7 @@ class DeepSeekV3Detector(BaseFormatDetector):
                 )
             )
             self.current_tool_name_sent = True
-            # Store the tool call info for adapter.py
+            # Store the tool call info for serving layer completions endpoint
             self.prev_tool_call_arr[self.current_tool_id] = {
                 "name": func_name,
                 "arguments": {},
@@ -153,7 +168,7 @@ class DeepSeekV3Detector(BaseFormatDetector):
             ] += argument_diff
 
             if _is_complete_json(func_args_raw):
-                # Update the stored arguments for adapter.py
+                # Update the stored arguments
                 try:
                     parsed_args = json.loads(func_args_raw)
                     self.prev_tool_call_arr[self.current_tool_id][