sglang-0.4.9.post3-py3-none-any.whl → sglang-0.4.9.post5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/chat_template.py +21 -0
- sglang/srt/_custom_ops.py +29 -1
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/model_config.py +5 -1
- sglang/srt/constrained/base_grammar_backend.py +10 -2
- sglang/srt/constrained/xgrammar_backend.py +7 -5
- sglang/srt/conversation.py +17 -2
- sglang/srt/debug_utils/__init__.py +0 -0
- sglang/srt/debug_utils/dump_comparator.py +131 -0
- sglang/srt/debug_utils/dumper.py +108 -0
- sglang/srt/debug_utils/text_comparator.py +172 -0
- sglang/srt/disaggregation/common/conn.py +34 -6
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
- sglang/srt/disaggregation/mini_lb.py +3 -2
- sglang/srt/disaggregation/mooncake/conn.py +65 -20
- sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
- sglang/srt/disaggregation/nixl/conn.py +17 -13
- sglang/srt/disaggregation/prefill.py +13 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
- sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
- sglang/srt/distributed/parallel_state.py +70 -15
- sglang/srt/entrypoints/engine.py +5 -9
- sglang/srt/entrypoints/http_server.py +20 -32
- sglang/srt/entrypoints/openai/protocol.py +3 -3
- sglang/srt/entrypoints/openai/serving_chat.py +148 -72
- sglang/srt/function_call/base_format_detector.py +74 -12
- sglang/srt/function_call/deepseekv3_detector.py +26 -11
- sglang/srt/function_call/ebnf_composer.py +105 -66
- sglang/srt/function_call/function_call_parser.py +6 -4
- sglang/srt/function_call/glm4_moe_detector.py +164 -0
- sglang/srt/function_call/kimik2_detector.py +41 -16
- sglang/srt/function_call/llama32_detector.py +6 -3
- sglang/srt/function_call/mistral_detector.py +11 -3
- sglang/srt/function_call/pythonic_detector.py +16 -14
- sglang/srt/function_call/qwen25_detector.py +12 -3
- sglang/srt/function_call/{qwen3_detector.py → qwen3_coder_detector.py} +11 -9
- sglang/srt/layers/activation.py +11 -3
- sglang/srt/layers/attention/base_attn_backend.py +3 -1
- sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
- sglang/srt/layers/attention/vision.py +56 -8
- sglang/srt/layers/communicator.py +12 -12
- sglang/srt/layers/dp_attention.py +72 -24
- sglang/srt/layers/layernorm.py +26 -1
- sglang/srt/layers/logits_processor.py +46 -25
- sglang/srt/layers/moe/ep_moe/layer.py +172 -206
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +25 -224
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
- sglang/srt/layers/moe/topk.py +88 -34
- sglang/srt/layers/multimodal.py +11 -8
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -9
- sglang/srt/layers/quantization/fp8.py +25 -247
- sglang/srt/layers/quantization/fp8_kernel.py +78 -48
- sglang/srt/layers/quantization/modelopt_quant.py +33 -14
- sglang/srt/layers/quantization/unquant.py +24 -76
- sglang/srt/layers/quantization/utils.py +0 -9
- sglang/srt/layers/quantization/w4afp8.py +68 -17
- sglang/srt/layers/radix_attention.py +5 -3
- sglang/srt/lora/lora_manager.py +133 -169
- sglang/srt/lora/lora_registry.py +188 -0
- sglang/srt/lora/mem_pool.py +2 -2
- sglang/srt/managers/cache_controller.py +62 -13
- sglang/srt/managers/io_struct.py +19 -1
- sglang/srt/managers/mm_utils.py +154 -35
- sglang/srt/managers/multimodal_processor.py +3 -14
- sglang/srt/managers/schedule_batch.py +27 -11
- sglang/srt/managers/scheduler.py +48 -26
- sglang/srt/managers/tokenizer_manager.py +62 -28
- sglang/srt/managers/tp_worker.py +5 -4
- sglang/srt/mem_cache/allocator.py +67 -7
- sglang/srt/mem_cache/hicache_storage.py +17 -1
- sglang/srt/mem_cache/hiradix_cache.py +35 -18
- sglang/srt/mem_cache/memory_pool_host.py +3 -0
- sglang/srt/model_executor/cuda_graph_runner.py +61 -25
- sglang/srt/model_executor/forward_batch_info.py +201 -29
- sglang/srt/model_executor/model_runner.py +109 -37
- sglang/srt/models/deepseek_v2.py +63 -30
- sglang/srt/models/glm4_moe.py +1035 -0
- sglang/srt/models/glm4_moe_nextn.py +167 -0
- sglang/srt/models/interns1.py +328 -0
- sglang/srt/models/internvl.py +143 -47
- sglang/srt/models/llava.py +9 -5
- sglang/srt/models/minicpmo.py +4 -1
- sglang/srt/models/mllama4.py +10 -3
- sglang/srt/models/qwen2_moe.py +2 -6
- sglang/srt/models/qwen3_moe.py +6 -8
- sglang/srt/multimodal/processors/base_processor.py +20 -6
- sglang/srt/multimodal/processors/clip.py +2 -2
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
- sglang/srt/multimodal/processors/gemma3.py +2 -2
- sglang/srt/multimodal/processors/gemma3n.py +2 -2
- sglang/srt/multimodal/processors/internvl.py +21 -8
- sglang/srt/multimodal/processors/janus_pro.py +2 -2
- sglang/srt/multimodal/processors/kimi_vl.py +2 -2
- sglang/srt/multimodal/processors/llava.py +4 -4
- sglang/srt/multimodal/processors/minicpm.py +2 -3
- sglang/srt/multimodal/processors/mlama.py +2 -2
- sglang/srt/multimodal/processors/mllama4.py +18 -111
- sglang/srt/multimodal/processors/phi4mm.py +2 -2
- sglang/srt/multimodal/processors/pixtral.py +2 -2
- sglang/srt/multimodal/processors/qwen_audio.py +2 -2
- sglang/srt/multimodal/processors/qwen_vl.py +2 -2
- sglang/srt/multimodal/processors/vila.py +3 -1
- sglang/srt/reasoning_parser.py +48 -5
- sglang/srt/sampling/sampling_batch_info.py +6 -5
- sglang/srt/server_args.py +132 -60
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +37 -36
- sglang/srt/speculative/eagle_utils.py +51 -23
- sglang/srt/speculative/eagle_worker.py +59 -44
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils.py +113 -69
- sglang/srt/weight_sync/utils.py +119 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_activation.py +50 -1
- sglang/test/test_utils.py +65 -5
- sglang/utils.py +19 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +6 -6
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +127 -114
- sglang/srt/debug_utils.py +0 -74
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/http_server.py:

@@ -107,6 +107,8 @@ from sglang.version import __version__
 logger = logging.getLogger(__name__)
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
+HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
+
 
 # Store global states
 @dataclasses.dataclass
@@ -212,9 +214,6 @@ async def validate_json_request(raw_request: Request):
     )
 
 
-HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
-
-
 ##### Native API endpoints #####
 
 
@@ -807,6 +806,24 @@ async def retrieve_model(model: str):
     )
 
 
+@app.post("/v1/score", dependencies=[Depends(validate_json_request)])
+async def v1_score_request(request: ScoringRequest, raw_request: Request):
+    """Endpoint for the decoder-only scoring API. See Engine.score() for detailed documentation."""
+    return await raw_request.app.state.openai_serving_score.handle_request(
+        request, raw_request
+    )
+
+
+@app.api_route(
+    "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
+)
+async def v1_rerank_request(request: V1RerankReqInput, raw_request: Request):
+    """Endpoint for reranking documents based on query relevance."""
+    return await raw_request.app.state.openai_serving_rerank.handle_request(
+        request, raw_request
+    )
+
+
 ## SageMaker API
 @app.get("/ping")
 async def sagemaker_health() -> Response:
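Both new endpoints delegate to serving objects hung on `app.state`, which keeps the route functions thin. A minimal self-contained sketch of that pattern (the `EchoScorer` class and payload model are illustrative stand-ins, not sglang's actual `ScoringRequest` or serving classes):

```python
from fastapi import Depends, FastAPI, Request
from pydantic import BaseModel

app = FastAPI()

class ScorePayload(BaseModel):  # stand-in for ScoringRequest
    query: str

class EchoScorer:  # stand-in for the object stored as openai_serving_score
    async def handle_request(self, request: ScorePayload, raw_request: Request):
        return {"scored": request.query}

app.state.openai_serving_score = EchoScorer()

async def validate_json_request(raw_request: Request):
    pass  # placeholder; the real dependency validates the JSON body

@app.post("/v1/score", dependencies=[Depends(validate_json_request)])
async def v1_score_request(request: ScorePayload, raw_request: Request):
    # Same delegation shape as the hunk above
    return await raw_request.app.state.openai_serving_score.handle_request(
        request, raw_request
    )

# api_route binds one handler to several verbs, as /v1/rerank does above
@app.api_route("/v1/rerank", methods=["POST", "PUT"])
async def v1_rerank_request(raw_request: Request):
    return {"ok": True}
```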
@@ -852,24 +869,6 @@ async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Request):
     return ORJSONResponse({"predictions": ret})
 
 
-@app.post("/v1/score", dependencies=[Depends(validate_json_request)])
-async def v1_score_request(request: ScoringRequest, raw_request: Request):
-    """Endpoint for the decoder-only scoring API. See Engine.score() for detailed documentation."""
-    return await raw_request.app.state.openai_serving_score.handle_request(
-        request, raw_request
-    )
-
-
-@app.api_route(
-    "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
-)
-async def v1_rerank_request(request: V1RerankReqInput, raw_request: Request):
-    """Endpoint for reranking documents based on query relevance."""
-    return await raw_request.app.state.openai_serving_rerank.handle_request(
-        request, raw_request
-    )
-
-
 def _create_error_response(e):
     return ORJSONResponse(
         {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
@@ -916,15 +915,6 @@ def launch_server(
         add_prometheus_middleware(app)
         enable_func_timer()
 
-    image_token_text = None
-    if (
-        tokenizer_manager.image_token_id is not None
-        and not server_args.skip_tokenizer_init
-    ):
-        image_token_text = tokenizer_manager.tokenizer.decode(
-            [tokenizer_manager.image_token_id]
-        )
-
     # Send a warmup request - we will create the thread launch it
     # in the lifespan after all other warmups have fired.
     warmup_thread = threading.Thread(
@@ -932,7 +922,6 @@
         args=(
             server_args,
             pipe_finish_writer,
-            image_token_text,
             launch_callback,
         ),
     )
@@ -1066,7 +1055,6 @@ def _execute_server_warmup(
 def _wait_and_warmup(
     server_args: ServerArgs,
     pipe_finish_writer: Optional[multiprocessing.connection.Connection],
-    image_token_text: str,
     launch_callback: Optional[Callable[[], None]] = None,
 ):
     if not server_args.skip_server_warmup:
sglang/srt/entrypoints/openai/protocol.py:

@@ -192,9 +192,9 @@ class CompletionRequest(BaseModel):
     session_params: Optional[Dict] = None
 
     # For PD disaggregation
-    bootstrap_host: Optional[str] = None
-    bootstrap_port: Optional[int] = None
-    bootstrap_room: Optional[int] = None
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
 
     # For request id
     rid: Optional[Union[List[str], str]] = None
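Widening the bootstrap fields to `Union[List[...], ...]` lets one `CompletionRequest` carry per-prompt bootstrap metadata for batched prefill/decode disaggregation. A quick validation check against a model trimmed to just these fields:

```python
from typing import List, Optional, Union
from pydantic import BaseModel

class CompletionRequest(BaseModel):
    # Trimmed to the PD-disaggregation fields changed in the hunk above
    bootstrap_host: Optional[Union[List[str], str]] = None
    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
    bootstrap_room: Optional[Union[List[int], int]] = None

# Scalars still validate, as in 0.4.9.post3 ...
CompletionRequest(bootstrap_host="10.0.0.1", bootstrap_port=8998, bootstrap_room=1)

# ... and lists (one entry per prompt in a batch) now validate too.
CompletionRequest(
    bootstrap_host=["10.0.0.1", "10.0.0.2"],
    bootstrap_port=[8998, None],
    bootstrap_room=[1, 2],
)
```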
sglang/srt/entrypoints/openai/serving_chat.py:

@@ -55,6 +55,20 @@ class OpenAIServingChat(OpenAIServingBase):
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
 
+    def _validate_request(self, request: ChatCompletionRequest) -> Optional[str]:
+        """Validate that the input is valid."""
+        if not request.messages:
+            return "Messages cannot be empty."
+
+        if (
+            isinstance(request.tool_choice, str)
+            and request.tool_choice.lower() == "required"
+            and not request.tools
+        ):
+            return "Tools cannot be empty if tool choice is set to required."
+
+        return None
+
     def _convert_to_internal_request(
         self,
         request: ChatCompletionRequest,
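`_validate_request` fails fast on two malformed shapes before any generation work is scheduled. Its logic is small enough to exercise standalone (the dataclass below is an illustrative stand-in for `ChatCompletionRequest`):

```python
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class FakeChatRequest:  # stand-in for ChatCompletionRequest
    messages: List[dict] = field(default_factory=list)
    tool_choice: str = "auto"
    tools: Optional[List[dict]] = None

def validate(request: FakeChatRequest) -> Optional[str]:
    # Same checks as OpenAIServingChat._validate_request in the hunk above
    if not request.messages:
        return "Messages cannot be empty."
    if (
        isinstance(request.tool_choice, str)
        and request.tool_choice.lower() == "required"
        and not request.tools
    ):
        return "Tools cannot be empty if tool choice is set to required."
    return None

assert validate(FakeChatRequest()) == "Messages cannot be empty."
assert (
    validate(FakeChatRequest(messages=[{"role": "user", "content": "hi"}],
                             tool_choice="required"))
    == "Tools cannot be empty if tool choice is set to required."
)
assert validate(FakeChatRequest(messages=[{"role": "user", "content": "hi"}])) is None
```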
@@ -398,6 +412,8 @@
         is_firsts = {}
         stream_buffers = {}
         n_prev_tokens = {}
+        has_tool_calls = {}
+        finish_reasons = {}
 
         # Usage tracking
         prompt_tokens = {}
@@ -429,6 +445,10 @@
                 finish_reason = content["meta_info"]["finish_reason"]
                 finish_reason_type = finish_reason["type"] if finish_reason else None
 
+                # Track finish_reason for each index
+                if finish_reason_type:
+                    finish_reasons[index] = finish_reason
+
                 # First chunk with role
                 if is_firsts.get(index, True):
                     is_firsts[index] = False
@@ -436,13 +456,8 @@
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
                         delta=delta,
-                        finish_reason=finish_reason_type,
-                        matched_stop=(
-                            finish_reason["matched"]
-                            if finish_reason and "matched" in finish_reason
-                            else None
-                        ),
-                        logprobs=choice_logprobs,
+                        finish_reason=None,
+                        logprobs=None,
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
@@ -469,7 +484,7 @@
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
                         delta=DeltaMessage(reasoning_content=reasoning_text),
-                        finish_reason=finish_reason_type,
+                        finish_reason=None,
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
@@ -479,9 +494,6 @@
                     )
                     yield f"data: {chunk.model_dump_json()}\n\n"
 
-                if not delta:
-                    continue
-
                 # Handle tool calls
                 if request.tool_choice != "none" and request.tools:
                     async for chunk in self._process_tool_call_stream(
@@ -490,28 +502,28 @@
                         parser_dict,
                         content,
                         request,
-                        finish_reason_type,
+                        has_tool_calls,
                     ):
-                        yield chunk
+                        if chunk:
+                            yield chunk
+
+                    # Send any remaining tool call arguments when generation finishes
+                    if finish_reason_type is not None and index in parser_dict:
+                        parser = parser_dict[index]
+                        remaining_chunk = self._check_for_unstreamed_tool_args(
+                            parser, content, request, index
+                        )
+                        if remaining_chunk:
+                            yield remaining_chunk
+
                 else:
                     # Regular content
-                    if delta or (
-                        request.stream_options and request.stream_options.include_usage
-                    ):
+                    if delta:
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=index,
                             delta=DeltaMessage(content=delta if delta else None),
-                            finish_reason=(
-                                None
-                                if request.stream_options
-                                and request.stream_options.include_usage
-                                else finish_reason_type
-                            ),
-                            matched_stop=(
-                                finish_reason["matched"]
-                                if finish_reason and "matched" in finish_reason
-                                else None
-                            ),
+                            finish_reason=None,
+                            matched_stop=None,
                             logprobs=choice_logprobs,
                         )
                         chunk = ChatCompletionStreamResponse(
@@ -522,26 +534,36 @@
                         )
                         yield f"data: {chunk.model_dump_json()}\n\n"
 
-                # … (the rest of this removed ~20-line block is illegible in the source rendering)
+        # Send finish_reason chunks for each index that completed
+        for idx, finish_reason_data in finish_reasons.items():
+            finish_reason_type = finish_reason_data["type"]
+
+            # Change finish_reason to "tool_calls" if we had tool calls and stopped naturally
+            final_finish_reason = finish_reason_type
+            if has_tool_calls.get(idx, False) and finish_reason_type == "stop":
+                final_finish_reason = "tool_calls"
+
+            finish_reason_chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"][
+                    "id"
+                ],  # NOTE: openai uses the same chatcmpl-id for all indices
+                created=int(time.time()),
+                choices=[
+                    ChatCompletionResponseStreamChoice(
+                        index=idx,
+                        delta=DeltaMessage(),
+                        finish_reason=final_finish_reason,
+                        matched_stop=(
+                            finish_reason_data["matched"]
+                            if "matched" in finish_reason_data
+                            else None
+                        ),
+                    )
+                ],
+                model=request.model,
+                usage=None,
+            )
+            yield f"data: {finish_reason_chunk.model_dump_json()}\n\n"
 
         # Send hidden states if requested
         if request.return_hidden_states and hidden_states:
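After this change a client sees content deltas with `finish_reason: null` and, once a choice completes, a dedicated empty-delta chunk carrying the final reason (promoted to `"tool_calls"` when tool calls were streamed and the model stopped naturally). An illustrative consumer of that stream shape (SSE framing simplified to a list of `data:` lines):

```python
import json

def read_stream(sse_lines):
    """Collect per-choice text and finish reasons from 'data: {...}' lines."""
    texts, reasons = {}, {}
    for line in sse_lines:
        if not line.startswith("data: ") or line == "data: [DONE]":
            continue
        chunk = json.loads(line[len("data: "):])
        for choice in chunk["choices"]:
            idx = choice["index"]
            delta = choice.get("delta") or {}
            if delta.get("content"):
                texts[idx] = texts.get(idx, "") + delta["content"]
            if choice.get("finish_reason"):  # only set on the final, empty-delta chunk
                reasons[idx] = choice["finish_reason"]
    return texts, reasons

stream = [
    'data: {"choices": [{"index": 0, "delta": {"content": "Hi"}, "finish_reason": null}]}',
    'data: {"choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]}',
    "data: [DONE]",
]
print(read_stream(stream))  # ({0: 'Hi'}, {0: 'stop'})
```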
@@ -561,7 +583,7 @@
                         delta=DeltaMessage(
                             hidden_states=last_token_hidden_states
                         ),
-                        finish_reason=finish_reason_type,
+                        finish_reason=None,  # Hidden states don't need finish_reason
                     )
                 ],
                 model=request.model,
@@ -840,7 +862,7 @@
         parser_dict: Dict[int, FunctionCallParser],
         content: Dict[str, Any],
         request: ChatCompletionRequest,
-        finish_reason_type: Optional[str],
+        has_tool_calls: Dict[int, bool],
     ):
         """Process tool calls in streaming response"""
         if index not in parser_dict:
@@ -857,7 +879,7 @@
             choice_data = ChatCompletionResponseStreamChoice(
                 index=index,
                 delta=DeltaMessage(content=normal_text),
-                finish_reason=finish_reason_type,
+                finish_reason=None,
             )
             chunk = ChatCompletionStreamResponse(
                 id=content["meta_info"]["id"],
@@ -869,6 +891,9 @@
 
         # Yield tool calls
         for call_item in calls:
+            # Mark that this choice has tool calls
+            has_tool_calls[index] = True
+
             # Tool call ID should be generated only once per tool call
             if call_item.name:
                 # First chunk: include ID and function name
@@ -879,23 +904,6 @@
                 tool_call_id = None
                 function_name = None
 
-            if finish_reason_type == "stop":
-                # Handle remaining arguments
-                latest_delta_len = 0
-                if isinstance(call_item.parameters, str):
-                    latest_delta_len = len(call_item.parameters)
-
-                expected_call = json.dumps(
-                    parser.detector.prev_tool_call_arr[index].get("arguments", {}),
-                    ensure_ascii=False,
-                )
-                actual_call = parser.detector.streamed_args_for_tool[index]
-                if latest_delta_len > 0:
-                    actual_call = actual_call[:-latest_delta_len]
-                remaining_call = expected_call.replace(actual_call, "", 1)
-                call_item.parameters = remaining_call
-                finish_reason_type = "tool_calls"
-
             tool_call = ToolCall(
                 id=tool_call_id,
                 index=call_item.tool_index,
@@ -908,11 +916,7 @@
             choice_data = ChatCompletionResponseStreamChoice(
                 index=index,
                 delta=DeltaMessage(tool_calls=[tool_call]),
-                finish_reason=(
-                    None
-                    if request.stream_options and request.stream_options.include_usage
-                    else finish_reason_type
-                ),
+                finish_reason=None,
             )
             chunk = ChatCompletionStreamResponse(
                 id=content["meta_info"]["id"],
@@ -921,3 +925,75 @@
                 model=request.model,
             )
             yield f"data: {chunk.model_dump_json()}\n\n"
+
+    def _check_for_unstreamed_tool_args(
+        self,
+        parser: FunctionCallParser,
+        content: Dict[str, Any],
+        request: ChatCompletionRequest,
+        index: int,
+    ) -> Optional[str]:
+        """
+        Check for any remaining tool call arguments that need to be streamed
+        when generation finishes. This ensures tool calls are properly completed
+        even if the model generates the final arguments in the last chunk.
+        """
+        # Only check if we have tool calls and the parser has tracked data
+        if (
+            not hasattr(parser.detector, "prev_tool_call_arr")
+            or not parser.detector.prev_tool_call_arr
+        ):
+            return None
+
+        if (
+            not hasattr(parser.detector, "streamed_args_for_tool")
+            or not parser.detector.streamed_args_for_tool
+        ):
+            return None
+
+        # Get the last tool call that was being processed
+        tool_index = len(parser.detector.prev_tool_call_arr) - 1
+        if tool_index < 0 or tool_index >= len(parser.detector.streamed_args_for_tool):
+            return None
+
+        # Get expected vs actual arguments
+        expected_args = parser.detector.prev_tool_call_arr[tool_index].get(
+            "arguments", {}
+        )
+        expected_call = json.dumps(expected_args, ensure_ascii=False)
+        actual_call = parser.detector.streamed_args_for_tool[tool_index]
+
+        # Check if there are remaining arguments to send
+        remaining_call = (
+            expected_call.replace(actual_call, "", 1)
+            if actual_call in expected_call
+            else ""
+        )
+
+        if remaining_call:
+            # Create tool call chunk with remaining arguments
+            tool_call = ToolCall(
+                id=None,  # No ID for argument deltas
+                index=tool_index,
+                function=FunctionResponse(
+                    name=None,  # No name for argument deltas
+                    arguments=remaining_call,
+                ),
+            )
+
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=index,
+                delta=DeltaMessage(tool_calls=[tool_call]),
+                finish_reason=None,  # Don't send finish_reason with this chunk
+            )
+
+            chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=int(time.time()),
+                choices=[choice_data],
+                model=request.model,
+            )
+
+            return f"data: {chunk.model_dump_json()}\n\n"
+
+        return None
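The heart of `_check_for_unstreamed_tool_args` is a string subtraction: serialize the parser's final view of the arguments and strip the prefix that was already streamed. The same computation in isolation:

```python
import json

def remaining_tool_args(expected_args: dict, already_streamed: str) -> str:
    """Return the argument suffix that still needs to be streamed, if any."""
    expected_call = json.dumps(expected_args, ensure_ascii=False)
    # Same guard as the method above: only subtract if what was streamed
    # is actually a fragment of the expected serialization.
    if already_streamed in expected_call:
        return expected_call.replace(already_streamed, "", 1)
    return ""

expected = {"location": "San Francisco", "unit": "celsius"}
streamed = '{"location": "San Francisco"'
print(remaining_tool_args(expected, streamed))  # ', "unit": "celsius"}'
```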
sglang/srt/function_call/base_format_detector.py:

@@ -25,23 +25,49 @@ class BaseFormatDetector(ABC):
     """Base class providing two sets of interfaces: one-time and streaming incremental."""
 
     def __init__(self):
-        # initialize properties used for state when parsing tool calls in
+        # Streaming state management
+        # Buffer for accumulating incomplete patterns that arrive across multiple streaming chunks
         self._buffer = ""
-        # streaming mode
+        # Stores complete tool call info (name and arguments) for each tool being parsed.
+        # Used by serving layer for completion handling when streaming ends.
+        # Format: [{"name": str, "arguments": dict}, ...]
         self.prev_tool_call_arr: List[Dict] = []
+        # Index of currently streaming tool call. Starts at -1 (no active tool),
+        # increments as each tool completes. Tracks which tool's arguments are streaming.
         self.current_tool_id: int = -1
+        # Flag for whether current tool's name has been sent to client.
+        # Tool names sent first with empty parameters, then arguments stream incrementally.
         self.current_tool_name_sent: bool = False
-        self.streamed_args_for_tool: List[str] = (
-            []
-        )  # map what has been streamed for each tool so far to a list
+        # Tracks raw JSON string content streamed to client for each tool's arguments.
+        # Critical for serving layer to calculate remaining content when streaming ends.
+        # Each index corresponds to a tool_id. Example: ['{"location": "San Francisco"', '{"temp": 72']
+        self.streamed_args_for_tool: List[str] = []
+
+        # Token configuration (override in subclasses)
         self.bot_token = ""
         self.eot_token = ""
         self.tool_call_separator = ", "
 
-    def parse_base_json(self, action: Any, tools: List[Tool]) -> List[ToolCallItem]:
-        tool_indices = {
+    def _get_tool_indices(self, tools: List[Tool]) -> Dict[str, int]:
+        """
+        Get a mapping of tool names to their indices in the tools list.
+
+        This utility method creates a dictionary mapping function names to their
+        indices in the tools list, which is commonly needed for tool validation
+        and ToolCallItem creation.
+
+        Args:
+            tools: List of available tools
+
+        Returns:
+            Dictionary mapping tool names to their indices
+        """
+        return {
             tool.function.name: i for i, tool in enumerate(tools) if tool.function.name
         }
+
+    def parse_base_json(self, action: Any, tools: List[Tool]) -> List[ToolCallItem]:
+        tool_indices = self._get_tool_indices(tools)
         if not isinstance(action, list):
             action = [action]
 
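The expanded comments pin down the invariant these fields maintain: entry *i* of `streamed_args_for_tool` is the raw argument JSON already sent to the client for the tool described by `prev_tool_call_arr[i]`. A toy mid-stream snapshot under that invariant (values illustrative):

```python
import json

# State after a tool-name chunk and one partial-arguments chunk; entries at the
# same index describe the same tool call.
prev_tool_call_arr = [
    {"name": "get_current_weather", "arguments": {"location": "San Francisco"}},
]
streamed_args_for_tool = ['{"location": "San Fran']  # raw JSON sent so far
current_tool_id = 0            # tool 0 is actively streaming
current_tool_name_sent = True  # its name chunk already went out

# The serving layer can always compute what is left to flush:
expected = json.dumps(prev_tool_call_arr[current_tool_id]["arguments"],
                      ensure_ascii=False)
print(expected.replace(streamed_args_for_tool[current_tool_id], "", 1))
# -> 'cisco"}'
```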
@@ -130,11 +156,7 @@ class BaseFormatDetector(ABC):
 
         # Build tool indices if not already built
         if not hasattr(self, "_tool_indices"):
-            self._tool_indices = {
-                tool.function.name: i
-                for i, tool in enumerate(tools)
-                if tool.function and tool.function.name
-            }
+            self._tool_indices = self._get_tool_indices(tools)
 
         flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
 
@@ -294,12 +316,52 @@ class BaseFormatDetector(ABC):
 
     @abstractmethod
     def has_tool_call(self, text: str) -> bool:
+        """
+        Check if the given text contains function call markers specific to this format.
+        """
         raise NotImplementedError()
 
+    def supports_structural_tag(self) -> bool:
+        """Return True if this detector supports structural tag format."""
+        return True
+
     @abstractmethod
     def structure_info(self) -> _GetInfoFunc:
+        """
+        Return a function that creates StructureInfo for constrained generation.
+
+        The returned function takes a tool name and returns a StructureInfo object
+        containing the begin/end patterns and trigger tokens needed for constrained
+        generation of function calls in this format.
+
+        Returns:
+            A function that takes a tool name (str) and returns StructureInfo
+        """
         raise NotImplementedError()
 
     @abstractmethod
     def build_ebnf(self, tools: List[Tool]) -> str:
+        """
+        Build an EBNF grammar for constrained generation of function calls.
+
+        This method generates an Extended Backus-Naur Form (EBNF) grammar that
+        constrains the model's output to valid function calls in this format.
+        The grammar should include all available tools and their parameter schemas.
+
+        Args:
+            tools: List of available tools/functions that can be called
+
+        Returns:
+            A string containing the EBNF grammar for this function call format
+
+        The EBNF grammar should:
+        - Define the overall structure of function calls in this format
+        - Include all tool names from the provided tools list
+        - Define valid JSON structures for function arguments
+        - Handle multiple function calls if the format supports them
+
+        Note:
+            Most implementations use EBNFComposer.build_ebnf() utility with
+            format-specific parameters rather than writing EBNF from scratch.
+        """
         raise NotImplementedError()
sglang/srt/function_call/deepseekv3_detector.py:

@@ -19,9 +19,28 @@ logger = logging.getLogger(__name__)
 
 class DeepSeekV3Detector(BaseFormatDetector):
     """
-    Detector for DeepSeek models.
-    Assumes function call format:
-        '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>get_current_weather\n```json\n{"location": "Tokyo"}\n```<｜tool▁call▁end｜><｜tool▁calls▁end｜><｜end▁of▁sentence｜>'
+    Detector for DeepSeek V3 model function call format.
+
+    The DeepSeek V3 format uses special Unicode tokens to delimit function calls
+    with JSON code blocks for arguments.
+
+    Format Structure:
+    ```
+    <｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>{function_name}\n```json\n{json_arguments}\n```<｜tool▁calls▁end｜><｜end▁of▁sentence｜>
+    ```
+    Examples:
+    ```
+    <｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>get_current_weather\n```json\n{"location": "Tokyo"}\n```<｜tool▁call▁end｜>\n<｜tool▁call▁begin｜>function<｜tool▁sep｜>get_current_weather\n```json\n{"location": "Paris"}\n```<｜tool▁call▁end｜><｜tool▁calls▁end｜><｜end▁of▁sentence｜>
+    ```
+
+    Key Components:
+    - Tool Calls Section: Wrapped between `<｜tool▁calls▁begin｜>` and `<｜tool▁calls▁end｜>`
+    - Individual Tool Call: Wrapped between `<｜tool▁call▁begin｜>` and `<｜tool▁call▁end｜>`
+    - Function Declaration: `function<｜tool▁sep｜>{function_name}`
+    - Arguments: JSON code block between ```json and ```
+    - Supports multiple tool calls
+
+    Reference: https://huggingface.co/deepseek-ai/DeepSeek-V3-0324?chat_template=default
     """
 
     def __init__(self):
@@ -89,16 +108,12 @@ class DeepSeekV3Detector(BaseFormatDetector):
             return StreamingParseResult(normal_text=new_text)
 
         if not hasattr(self, "_tool_indices"):
-            self._tool_indices = {
-                tool.function.name: i
-                for i, tool in enumerate(tools)
-                if tool.function and tool.function.name
-            }
+            self._tool_indices = self._get_tool_indices(tools)
 
         calls: list[ToolCallItem] = []
         try:
             partial_match = re.search(
-                pattern=r"<｜tool▁call▁begin｜>(.*)<｜tool▁sep｜>(.*)\n```json\n(.*)",
+                pattern=r"<｜tool▁call▁begin｜>(.*)<｜tool▁sep｜>(.*)\n```json\n(.*)\n```.*",
                 string=current_text,
                 flags=re.DOTALL,
            )
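The old pattern fired as soon as the opening JSON fence appeared, so the third capture group could hold a half-written argument object; the new pattern also requires the closing fence, so any match carries a complete JSON payload. A sanity check against synthetic chunks (token spelling as in the hunk, using the fullwidth ｜ from the DeepSeek tokenizer):

````python
import re

# Pattern copied from the new side of the hunk above.
PATTERN = r"<｜tool▁call▁begin｜>(.*)<｜tool▁sep｜>(.*)\n```json\n(.*)\n```.*"

complete = (
    "<｜tool▁call▁begin｜>function<｜tool▁sep｜>get_current_weather\n"
    '```json\n{"location": "Tokyo"}\n```<｜tool▁call▁end｜>'
)
partial = (
    "<｜tool▁call▁begin｜>function<｜tool▁sep｜>get_current_weather\n"
    '```json\n{"location": "To'
)

m = re.search(PATTERN, complete, flags=re.DOTALL)
assert m and m.group(2) == "get_current_weather"
assert m.group(3) == '{"location": "Tokyo"}'
# The partial chunk no longer matches, so streaming waits for more text.
assert re.search(PATTERN, partial, flags=re.DOTALL) is None
````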
@@ -127,7 +142,7 @@ class DeepSeekV3Detector(BaseFormatDetector):
                     )
                 )
                 self.current_tool_name_sent = True
-                # Store the tool call info for
+                # Store the tool call info for serving layer completions endpoint
                 self.prev_tool_call_arr[self.current_tool_id] = {
                     "name": func_name,
                     "arguments": {},
@@ -153,7 +168,7 @@ class DeepSeekV3Detector(BaseFormatDetector):
                 ] += argument_diff
 
             if _is_complete_json(func_args_raw):
-                # Update the stored arguments
+                # Update the stored arguments
                 try:
                     parsed_args = json.loads(func_args_raw)
                     self.prev_tool_call_arr[self.current_tool_id][
|