sglang 0.4.9.post4__py3-none-any.whl → 0.4.9.post6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. sglang/lang/chat_template.py +21 -0
  2. sglang/srt/configs/internvl.py +3 -0
  3. sglang/srt/configs/model_config.py +7 -0
  4. sglang/srt/constrained/base_grammar_backend.py +10 -2
  5. sglang/srt/constrained/xgrammar_backend.py +7 -5
  6. sglang/srt/conversation.py +16 -1
  7. sglang/srt/debug_utils/__init__.py +0 -0
  8. sglang/srt/debug_utils/dump_comparator.py +131 -0
  9. sglang/srt/debug_utils/dumper.py +108 -0
  10. sglang/srt/debug_utils/text_comparator.py +172 -0
  11. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
  12. sglang/srt/disaggregation/mooncake/conn.py +16 -0
  13. sglang/srt/disaggregation/prefill.py +13 -1
  14. sglang/srt/entrypoints/engine.py +4 -2
  15. sglang/srt/entrypoints/http_server.py +13 -1
  16. sglang/srt/entrypoints/openai/protocol.py +3 -1
  17. sglang/srt/entrypoints/openai/serving_base.py +5 -2
  18. sglang/srt/entrypoints/openai/serving_chat.py +132 -79
  19. sglang/srt/function_call/ebnf_composer.py +10 -3
  20. sglang/srt/function_call/function_call_parser.py +2 -0
  21. sglang/srt/function_call/glm4_moe_detector.py +164 -0
  22. sglang/srt/function_call/qwen3_coder_detector.py +1 -0
  23. sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
  24. sglang/srt/layers/attention/vision.py +56 -8
  25. sglang/srt/layers/layernorm.py +26 -1
  26. sglang/srt/layers/logits_processor.py +14 -3
  27. sglang/srt/layers/moe/ep_moe/layer.py +323 -242
  28. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
  29. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
  33. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
  34. sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
  35. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
  36. sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
  37. sglang/srt/layers/moe/topk.py +90 -24
  38. sglang/srt/layers/multimodal.py +11 -8
  39. sglang/srt/layers/quantization/fp8.py +25 -247
  40. sglang/srt/layers/quantization/fp8_kernel.py +78 -48
  41. sglang/srt/layers/quantization/modelopt_quant.py +27 -10
  42. sglang/srt/layers/quantization/unquant.py +24 -76
  43. sglang/srt/layers/quantization/w4afp8.py +68 -17
  44. sglang/srt/lora/lora_registry.py +93 -29
  45. sglang/srt/managers/cache_controller.py +9 -7
  46. sglang/srt/managers/data_parallel_controller.py +4 -0
  47. sglang/srt/managers/io_struct.py +12 -0
  48. sglang/srt/managers/mm_utils.py +154 -35
  49. sglang/srt/managers/multimodal_processor.py +3 -14
  50. sglang/srt/managers/schedule_batch.py +14 -8
  51. sglang/srt/managers/scheduler.py +64 -1
  52. sglang/srt/managers/scheduler_input_blocker.py +106 -0
  53. sglang/srt/managers/tokenizer_manager.py +80 -15
  54. sglang/srt/managers/tp_worker.py +8 -0
  55. sglang/srt/mem_cache/hiradix_cache.py +5 -2
  56. sglang/srt/model_executor/model_runner.py +83 -27
  57. sglang/srt/models/deepseek_v2.py +75 -84
  58. sglang/srt/models/glm4_moe.py +1035 -0
  59. sglang/srt/models/glm4_moe_nextn.py +167 -0
  60. sglang/srt/models/interns1.py +328 -0
  61. sglang/srt/models/internvl.py +143 -47
  62. sglang/srt/models/llava.py +9 -5
  63. sglang/srt/models/minicpmo.py +4 -1
  64. sglang/srt/models/qwen2_moe.py +2 -2
  65. sglang/srt/models/qwen3_moe.py +17 -71
  66. sglang/srt/multimodal/processors/base_processor.py +20 -6
  67. sglang/srt/multimodal/processors/clip.py +2 -2
  68. sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
  69. sglang/srt/multimodal/processors/gemma3.py +2 -2
  70. sglang/srt/multimodal/processors/gemma3n.py +2 -2
  71. sglang/srt/multimodal/processors/internvl.py +21 -8
  72. sglang/srt/multimodal/processors/janus_pro.py +2 -2
  73. sglang/srt/multimodal/processors/kimi_vl.py +2 -2
  74. sglang/srt/multimodal/processors/llava.py +4 -4
  75. sglang/srt/multimodal/processors/minicpm.py +2 -3
  76. sglang/srt/multimodal/processors/mlama.py +2 -2
  77. sglang/srt/multimodal/processors/mllama4.py +18 -111
  78. sglang/srt/multimodal/processors/phi4mm.py +2 -2
  79. sglang/srt/multimodal/processors/pixtral.py +2 -2
  80. sglang/srt/multimodal/processors/qwen_audio.py +2 -2
  81. sglang/srt/multimodal/processors/qwen_vl.py +2 -2
  82. sglang/srt/multimodal/processors/vila.py +3 -1
  83. sglang/srt/poll_based_barrier.py +31 -0
  84. sglang/srt/reasoning_parser.py +2 -1
  85. sglang/srt/server_args.py +65 -6
  86. sglang/srt/two_batch_overlap.py +8 -3
  87. sglang/srt/utils.py +96 -1
  88. sglang/srt/weight_sync/utils.py +119 -0
  89. sglang/test/runners.py +4 -0
  90. sglang/test/test_utils.py +118 -5
  91. sglang/utils.py +19 -0
  92. sglang/version.py +1 -1
  93. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/METADATA +5 -4
  94. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/RECORD +97 -80
  95. sglang/srt/debug_utils.py +0 -74
  96. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/WHEEL +0 -0
  97. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/licenses/LICENSE +0 -0
  98. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/engine.py
@@ -640,7 +640,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.9rc1",
+            "0.2.9rc2",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -765,7 +765,9 @@ def _launch_subprocesses(
         # When using `Engine` as a Python API, we don't want to block here.
         return None, None, None

-    launch_dummy_health_check_server(server_args.host, server_args.port)
+    launch_dummy_health_check_server(
+        server_args.host, server_args.port, server_args.enable_metrics
+    )

     for proc in scheduler_procs:
         proc.join()
sglang/srt/entrypoints/http_server.py
@@ -38,7 +38,7 @@ import orjson
 import requests
 import uvicorn
 import uvloop
-from fastapi import Depends, FastAPI, Request, UploadFile
+from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse
@@ -174,6 +174,18 @@ app.add_middleware(
 )


+@app.exception_handler(HTTPException)
+async def validation_exception_handler(request: Request, exc: HTTPException):
+    """Enrich HTTP exception with status code and other details"""
+    error = ErrorResponse(
+        object="error",
+        message=exc.detail,
+        type=str(exc.status_code),
+        code=exc.status_code,
+    )
+    return ORJSONResponse(content=error.model_dump(), status_code=exc.status_code)
+
+
 # Custom exception handlers to change validation error status codes
 @app.exception_handler(RequestValidationError)
 async def validation_exception_handler(request: Request, exc: RequestValidationError):
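With this handler registered, an HTTPException raised anywhere in the app comes back as an OpenAI-style error object instead of FastAPI's default {"detail": ...} body. A minimal client-side sketch of the effect (the endpoint, payload, and error text are illustrative):

    import requests

    resp = requests.post(
        "http://localhost:30000/v1/chat/completions",  # illustrative host/port
        json={"model": "some-model", "messages": [{"role": "user", "content": "hi"}]},
    )
    if resp.status_code != 200:
        # For HTTPException(status_code=404, detail="Model not found"), the body
        # now looks like {"object": "error", "message": "Model not found",
        # "type": "404", "code": 404, ...} (other ErrorResponse fields omitted).
        print(resp.json())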
sglang/srt/entrypoints/openai/protocol.py
@@ -317,7 +317,9 @@ class ToolCall(BaseModel):

 class ChatCompletionMessageGenericParam(BaseModel):
     role: Literal["system", "assistant", "tool"]
-    content: Union[str, List[ChatCompletionMessageContentTextPart], None]
+    content: Union[str, List[ChatCompletionMessageContentTextPart], None] = Field(
+        default=None
+    )
     tool_call_id: Optional[str] = None
     name: Optional[str] = None
     reasoning_content: Optional[str] = None
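The explicit Field(default=None) matters: pydantic treats a field whose annotation merely permits None as required unless a default is supplied, so an assistant message that omits content (for example, one carrying only tool calls) would previously fail validation. A standalone sketch with a simplified model:

    from typing import Literal, Optional, Union

    from pydantic import BaseModel, Field

    class Msg(BaseModel):  # simplified stand-in for ChatCompletionMessageGenericParam
        role: Literal["system", "assistant", "tool"]
        content: Union[str, None] = Field(default=None)
        tool_call_id: Optional[str] = None

    print(Msg(role="assistant").content)  # None, instead of a validation error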
sglang/srt/entrypoints/openai/serving_base.py
@@ -4,7 +4,7 @@ import uuid
 from abc import ABC, abstractmethod
 from typing import Any, Optional, Union

-from fastapi import Request
+from fastapi import HTTPException, Request
 from fastapi.responses import ORJSONResponse, StreamingResponse

 from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
@@ -45,7 +45,10 @@ class OpenAIServingBase(ABC):
             return await self._handle_non_streaming_request(
                 adapted_request, processed_request, raw_request
             )
-
+        except HTTPException as e:
+            return self.create_error_response(
+                message=e.detail, err_type=str(e.status_code), status_code=e.status_code
+            )
         except Exception as e:
             logger.exception(f"Error in request: {e}")
             return self.create_error_response(
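Together with the app-level handler above, this lets any serving implementation reject a request by raising HTTPException and have it surface as a structured error with the correct status code rather than a generic 500. A hypothetical sketch (the helper name and arguments are illustrative, not sglang APIs):

    from fastapi import HTTPException

    def require_loaded_adapter(name: str, loaded: set) -> None:
        # Hypothetical validation step inside a request handler.
        if name not in loaded:
            raise HTTPException(status_code=400, detail=f"LoRA adapter not loaded: {name}")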
sglang/srt/entrypoints/openai/serving_chat.py
@@ -412,6 +412,8 @@ class OpenAIServingChat(OpenAIServingBase):
         is_firsts = {}
         stream_buffers = {}
         n_prev_tokens = {}
+        has_tool_calls = {}
+        finish_reasons = {}

         # Usage tracking
         prompt_tokens = {}
@@ -443,6 +445,10 @@
                 finish_reason = content["meta_info"]["finish_reason"]
                 finish_reason_type = finish_reason["type"] if finish_reason else None

+                # Track finish_reason for each index
+                if finish_reason_type:
+                    finish_reasons[index] = finish_reason
+
                 # First chunk with role
                 if is_firsts.get(index, True):
                     is_firsts[index] = False
@@ -450,13 +456,8 @@ class OpenAIServingChat(OpenAIServingBase):
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
                         delta=delta,
-                        finish_reason=finish_reason_type,
-                        matched_stop=(
-                            finish_reason["matched"]
-                            if finish_reason and "matched" in finish_reason
-                            else None
-                        ),
-                        logprobs=choice_logprobs,
+                        finish_reason=None,
+                        logprobs=None,
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
@@ -483,7 +484,7 @@
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=index,
                         delta=DeltaMessage(reasoning_content=reasoning_text),
-                        finish_reason=finish_reason_type,
+                        finish_reason=None,
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
@@ -493,45 +494,36 @@
                     )
                     yield f"data: {chunk.model_dump_json()}\n\n"

-                if not delta:
-                    continue
-
                 # Handle tool calls
                 if request.tool_choice != "none" and request.tools:
-                    async for (
-                        chunk,
-                        tool_call_finish_reason_type,
-                    ) in self._process_tool_call_stream(
+                    async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
                         parser_dict,
                         content,
                         request,
-                        finish_reason_type,
+                        has_tool_calls,
                     ):
                         if chunk:
                             yield chunk
-                        finish_reason_type = tool_call_finish_reason_type
+
+                    # Send any remaining tool call arguments when generation finishes
+                    if finish_reason_type is not None and index in parser_dict:
+                        parser = parser_dict[index]
+                        remaining_chunk = self._check_for_unstreamed_tool_args(
+                            parser, content, request, index
+                        )
+                        if remaining_chunk:
+                            yield remaining_chunk

                 else:
                     # Regular content
-                    if delta or not (
-                        request.stream_options and request.stream_options.include_usage
-                    ):
+                    if delta:
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=index,
                             delta=DeltaMessage(content=delta if delta else None),
-                            finish_reason=(
-                                None
-                                if request.stream_options
-                                and request.stream_options.include_usage
-                                else finish_reason_type
-                            ),
-                            matched_stop=(
-                                finish_reason["matched"]
-                                if finish_reason and "matched" in finish_reason
-                                else None
-                            ),
+                            finish_reason=None,
+                            matched_stop=None,
                             logprobs=choice_logprobs,
                         )
                         chunk = ChatCompletionStreamResponse(
@@ -542,26 +534,36 @@
                         )
                         yield f"data: {chunk.model_dump_json()}\n\n"

-            # Final chunk with finish_reason
-            finish_reason_chunk = ChatCompletionStreamResponse(
-                id=content["meta_info"]["id"],
-                created=int(time.time()),
-                choices=[
-                    ChatCompletionResponseStreamChoice(
-                        index=index,
-                        delta=DeltaMessage(),
-                        finish_reason=finish_reason_type,
-                        matched_stop=(
-                            finish_reason["matched"]
-                            if finish_reason and "matched" in finish_reason
-                            else None
-                        ),
-                    )
-                ],
-                model=request.model,
-                usage=None,
-            )
-            yield f"data: {finish_reason_chunk.model_dump_json()}\n\n"
+            # Send finish_reason chunks for each index that completed
+            for idx, finish_reason_data in finish_reasons.items():
+                finish_reason_type = finish_reason_data["type"]
+
+                # Change finish_reason to "tool_calls" if we had tool calls and stopped naturally
+                final_finish_reason = finish_reason_type
+                if has_tool_calls.get(idx, False) and finish_reason_type == "stop":
+                    final_finish_reason = "tool_calls"
+
+                finish_reason_chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"][
+                        "id"
+                    ],  # NOTE: openai uses the same chatcmpl-id for all indices
+                    created=int(time.time()),
+                    choices=[
+                        ChatCompletionResponseStreamChoice(
+                            index=idx,
+                            delta=DeltaMessage(),
+                            finish_reason=final_finish_reason,
+                            matched_stop=(
+                                finish_reason_data["matched"]
+                                if "matched" in finish_reason_data
+                                else None
+                            ),
+                        )
+                    ],
+                    model=request.model,
+                    usage=None,
+                )
+                yield f"data: {finish_reason_chunk.model_dump_json()}\n\n"

             # Send hidden states if requested
             if request.return_hidden_states and hidden_states:
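The net effect on the wire: content deltas no longer carry finish_reason, and each choice index now gets exactly one terminal chunk. For a request with n=2 where choice 0 emitted tool calls and stopped naturally, the tail of the stream would look roughly like this (ids abbreviated, most fields omitted):

    data: {"id":"chatcmpl-xyz","choices":[{"index":0,"delta":{},"finish_reason":"tool_calls"}],...}
    data: {"id":"chatcmpl-xyz","choices":[{"index":1,"delta":{},"finish_reason":"stop"}],...}
    data: [DONE]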
@@ -581,7 +583,7 @@
                         delta=DeltaMessage(
                             hidden_states=last_token_hidden_states
                         ),
-                        finish_reason=finish_reason_type,
+                        finish_reason=None,  # Hidden states don't need finish_reason
                     )
                 ],
                 model=request.model,
@@ -860,7 +862,7 @@
         parser_dict: Dict[int, FunctionCallParser],
         content: Dict[str, Any],
         request: ChatCompletionRequest,
-        finish_reason_type: Optional[str],
+        has_tool_calls: Dict[int, bool],
     ):
         """Process tool calls in streaming response"""
         if index not in parser_dict:
@@ -877,7 +879,7 @@
             choice_data = ChatCompletionResponseStreamChoice(
                 index=index,
                 delta=DeltaMessage(content=normal_text),
-                finish_reason=finish_reason_type,
+                finish_reason=None,
             )
             chunk = ChatCompletionStreamResponse(
                 id=content["meta_info"]["id"],
@@ -885,10 +887,13 @@
                 choices=[choice_data],
                 model=request.model,
             )
-            yield f"data: {chunk.model_dump_json()}\n\n", finish_reason_type
+            yield f"data: {chunk.model_dump_json()}\n\n"

         # Yield tool calls
         for call_item in calls:
+            # Mark that this choice has tool calls
+            has_tool_calls[index] = True
+
             # Tool call ID should be generated only once per tool call
             if call_item.name:
                 # First chunk: include ID and function name
@@ -899,23 +904,6 @@
                 tool_call_id = None
                 function_name = None

-            if finish_reason_type == "stop":
-                # Handle remaining arguments
-                latest_delta_len = 0
-                if isinstance(call_item.parameters, str):
-                    latest_delta_len = len(call_item.parameters)
-
-                expected_call = json.dumps(
-                    parser.detector.prev_tool_call_arr[index].get("arguments", {}),
-                    ensure_ascii=False,
-                )
-                actual_call = parser.detector.streamed_args_for_tool[index]
-                if latest_delta_len > 0:
-                    actual_call = actual_call[:-latest_delta_len]
-                remaining_call = expected_call.replace(actual_call, "", 1)
-                call_item.parameters = remaining_call
-                finish_reason_type = "tool_calls"
-
             tool_call = ToolCall(
                 id=tool_call_id,
                 index=call_item.tool_index,
@@ -928,19 +916,84 @@
             choice_data = ChatCompletionResponseStreamChoice(
                 index=index,
                 delta=DeltaMessage(tool_calls=[tool_call]),
-                finish_reason=(
-                    None
-                    if request.stream_options and request.stream_options.include_usage
-                    else finish_reason_type
+                finish_reason=None,
+            )
+            chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=int(time.time()),
+                choices=[choice_data],
+                model=request.model,
+            )
+            yield f"data: {chunk.model_dump_json()}\n\n"
+
+    def _check_for_unstreamed_tool_args(
+        self,
+        parser: FunctionCallParser,
+        content: Dict[str, Any],
+        request: ChatCompletionRequest,
+        index: int,
+    ) -> Optional[str]:
+        """
+        Check for any remaining tool call arguments that need to be streamed
+        when generation finishes. This ensures tool calls are properly completed
+        even if the model generates the final arguments in the last chunk.
+        """
+        # Only check if we have tool calls and the parser has tracked data
+        if (
+            not hasattr(parser.detector, "prev_tool_call_arr")
+            or not parser.detector.prev_tool_call_arr
+        ):
+            return None
+
+        if (
+            not hasattr(parser.detector, "streamed_args_for_tool")
+            or not parser.detector.streamed_args_for_tool
+        ):
+            return None
+
+        # Get the last tool call that was being processed
+        tool_index = len(parser.detector.prev_tool_call_arr) - 1
+        if tool_index < 0 or tool_index >= len(parser.detector.streamed_args_for_tool):
+            return None
+
+        # Get expected vs actual arguments
+        expected_args = parser.detector.prev_tool_call_arr[tool_index].get(
+            "arguments", {}
+        )
+        expected_call = json.dumps(expected_args, ensure_ascii=False)
+        actual_call = parser.detector.streamed_args_for_tool[tool_index]
+
+        # Check if there are remaining arguments to send
+        remaining_call = (
+            expected_call.replace(actual_call, "", 1)
+            if actual_call in expected_call
+            else ""
+        )
+
+        if remaining_call:
+            # Create tool call chunk with remaining arguments
+            tool_call = ToolCall(
+                id=None,  # No ID for argument deltas
+                index=tool_index,
+                function=FunctionResponse(
+                    name=None,  # No name for argument deltas
+                    arguments=remaining_call,
                 ),
             )
+
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=index,
+                delta=DeltaMessage(tool_calls=[tool_call]),
+                finish_reason=None,  # Don't send finish_reason with this chunk
+            )
+
             chunk = ChatCompletionStreamResponse(
                 id=content["meta_info"]["id"],
                 created=int(time.time()),
                 choices=[choice_data],
                 model=request.model,
             )
-            yield f"data: {chunk.model_dump_json()}\n\n", finish_reason_type

-        if finish_reason_type == "stop":
-            yield None, "tool_calls"
+            return f"data: {chunk.model_dump_json()}\n\n"
+
+        return None
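The remainder computation can be sanity-checked in isolation; with made-up values, the returned tail is exactly the suffix of the serialized arguments that never reached the client:

    import json

    expected_call = json.dumps({"city": "Beijing", "date": "2024-06-27"}, ensure_ascii=False)
    actual_call = '{"city": "Beijing", "date": "2024-06'  # streamed so far (made up)
    remaining = (
        expected_call.replace(actual_call, "", 1) if actual_call in expected_call else ""
    )
    print(remaining)  # -27"}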
sglang/srt/function_call/ebnf_composer.py
@@ -165,6 +165,7 @@ class EBNFComposer:
         tool_call_separator: Optional[str] = None,
         call_rule_fmt: Optional[str] = None,
         key_value_rule_fmt: Optional[str] = None,
+        key_value_separator: str = ",",
     ):
         """
         Generalized EBNF builder for all detectors.
@@ -279,7 +280,11 @@

             # Add required properties joined by commas
             if required:
-                rule_parts.append(' "," '.join(prop_kv_pairs[k] for k in required))
+                rule_parts.append(
+                    f' "{key_value_separator}" '.join(
+                        prop_kv_pairs[k] for k in required
+                    )
+                )

             # Add optional properties with flexible ordering
             if optional:
@@ -292,13 +297,15 @@
                     if j == i:
                         opt_parts.append(prop_kv_pairs[optional[j]])
                     else:
-                        opt_parts.append(f' ( "," {prop_kv_pairs[optional[j]]} )?')
+                        opt_parts.append(
+                            f' ( "{key_value_separator}" {prop_kv_pairs[optional[j]]} )?'
+                        )
                 opt_alternatives.append("".join(opt_parts))

             # Wrap with appropriate comma handling based on whether we have required properties
             if required:
                 # Required properties exist, so optional group needs outer comma
-                rule_parts.append(' ( "," ( ')
+                rule_parts.append(f' ( "{key_value_separator}" ( ')
                 rule_parts.append(" | ".join(opt_alternatives))
                 rule_parts.append(" ) )?")
             else:
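The effect of the new parameter is easiest to see on the required-properties join: with the separator "\n" used by the GLM-4.5 detector, the generated fragment switches from comma-delimited to newline-delimited key-value pairs (rule names below are illustrative):

    prop_kv_pairs = {"city": "city_kv", "date": "date_kv"}  # illustrative rule names
    required = ["city", "date"]

    for key_value_separator in (",", "\\n"):
        print(f' "{key_value_separator}" '.join(prop_kv_pairs[k] for k in required))
    # city_kv "," date_kv
    # city_kv "\n" date_kv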
sglang/srt/function_call/function_call_parser.py
@@ -10,6 +10,7 @@ from sglang.srt.entrypoints.openai.protocol import (
 from sglang.srt.function_call.base_format_detector import BaseFormatDetector
 from sglang.srt.function_call.core_types import ToolCallItem
 from sglang.srt.function_call.deepseekv3_detector import DeepSeekV3Detector
+from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
 from sglang.srt.function_call.kimik2_detector import KimiK2Detector
 from sglang.srt.function_call.llama32_detector import Llama32Detector
 from sglang.srt.function_call.mistral_detector import MistralDetector
@@ -37,6 +38,7 @@ class FunctionCallParser:
         "pythonic": PythonicDetector,
         "kimi_k2": KimiK2Detector,
         "qwen3_coder": Qwen3CoderDetector,
+        "glm45": Glm4MoeDetector,
     }

     def __init__(self, tools: List[Tool], tool_call_parser: str):
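With this registration, the new detector is selectable by name through the constructor shown above (tools being the request's List[Tool]):

    from sglang.srt.function_call.function_call_parser import FunctionCallParser

    parser = FunctionCallParser(tools=tools, tool_call_parser="glm45")  # tools defined elsewhere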
sglang/srt/function_call/glm4_moe_detector.py (new file)
@@ -0,0 +1,164 @@
+import ast
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.ebnf_composer import EBNFComposer
+
+logger = logging.getLogger(__name__)
+
+
+def get_argument_type(func_name: str, arg_key: str, defined_tools: list):
+    name2tool = {tool.function.name: tool for tool in defined_tools}
+    if func_name not in name2tool:
+        return None
+    tool = name2tool[func_name]
+    if arg_key not in tool.function.parameters["properties"]:
+        return None
+    return tool.function.parameters["properties"][arg_key].get("type", None)
+
+
+def parse_arguments(json_value):
+    try:
+        try:
+            parsed_value = json.loads(json_value)
+        except:
+            parsed_value = ast.literal_eval(json_value)
+        return parsed_value, True
+    except:
+        return json_value, False
+
+
+class Glm4MoeDetector(BaseFormatDetector):
+    """
+    Detector for GLM-4.5 models.
+    Assumes function call format:
+    <tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>北京</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>\n<tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>上海</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<tool_call>"
+        self.eot_token = "</tool_call>"
+        self.func_call_regex = r"<tool_call>.*?</tool_call>"
+        self.func_detail_regex = r"<tool_call>([^\n]*)\n(.*)</tool_call>"
+        self.func_arg_regex = r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>"
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a glm-4.5 format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+        match_result_list = re.findall(self.func_call_regex, text, re.DOTALL)
+        calls = []
+        try:
+            for match_result in match_result_list:
+                # Get function name
+                func_detail = re.search(self.func_detail_regex, match_result, re.DOTALL)
+                func_name = func_detail.group(1)
+                func_args = func_detail.group(2)
+                pairs = re.findall(
+                    r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>",
+                    func_args,
+                    re.DOTALL,
+                )
+                arguments = {}
+                for arg_key, arg_value in pairs:
+                    arg_key = arg_key.strip()
+                    arg_value = arg_value.strip()
+                    arg_type = get_argument_type(func_name, arg_key, tools)
+                    if arg_type != "string":
+                        arg_value, is_good_json = parse_arguments(arg_value)
+                    arguments[arg_key] = arg_value
+                # construct match_result for parse_base_json
+                match_result = {"name": func_name, "parameters": arguments}
+                calls.extend(self.parse_base_json(match_result, tools))
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # return the normal text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing tool calls for GLM-4.5 format.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        start = current_text.find(self.bot_token)
+        if start == -1:
+            self._buffer = ""
+            if self.current_tool_id > 0:
+                current_text = ""
+            return StreamingParseResult(normal_text=current_text)
+        # find ensures we find the first self.eot_token so there will be at most one tool_call in current_text[:end+len(self.eot_token)
+        end = current_text.find(self.eot_token)
+        if end != -1:
+            # Initialize state if this is the first tool call
+            if self.current_tool_id == -1:
+                self.current_tool_id = 0
+                self.prev_tool_call_arr = []
+                self.streamed_args_for_tool = [""]
+            # Ensure we have enough entries in our tracking arrays
+            while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                self.prev_tool_call_arr.append({})
+            while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                self.streamed_args_for_tool.append("")
+            result = self.detect_and_parse(
+                current_text[: end + len(self.eot_token)], tools=tools
+            )
+            if result.calls:
+                self.prev_tool_call_arr[self.current_tool_id] = {
+                    "name": result.calls[0].name,
+                    "arguments": json.loads(result.calls[0].parameters),
+                }
+                self.streamed_args_for_tool[self.current_tool_id] = result.calls[
+                    0
+                ].parameters
+                result.calls[0].tool_index = self.current_tool_id
+                self.current_tool_id += 1
+            self._buffer = current_text[end + len(self.eot_token) :]
+            return result
+        normal_text = current_text[:start]
+        self._buffer = current_text[start:]
+        return StreamingParseResult(normal_text=normal_text)
+
+    def supports_structural_tag(self) -> bool:
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError()
+
+    def build_ebnf(self, tools: List[Tool]):
+        return EBNFComposer.build_ebnf(
+            tools,
+            individual_call_start_token=self.bot_token,
+            individual_call_end_token=self.eot_token,
+            tool_call_separator="\\n",
+            function_format="xml",
+            call_rule_fmt='"{name}" "\\n" {arguments_rule} "\\n"',
+            key_value_rule_fmt='"<arg_key>{key}</arg_key>" "\\n" "<arg_value>" {valrule} "</arg_value>"',
+            key_value_separator="\\n",
+        )
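A quick usage sketch of the detector on a complete single-call output; the sample text follows the format documented in the class docstring:

    from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector

    detector = Glm4MoeDetector()
    text = (
        "<tool_call>get_weather\n"
        "<arg_key>city</arg_key>\n<arg_value>Beijing</arg_value>\n"
        "</tool_call>"
    )
    print(detector.has_tool_call(text))  # True
    # detect_and_parse(text, tools) would additionally return the parsed call,
    # typing each argument against the supplied Tool definitions.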
sglang/srt/function_call/qwen3_coder_detector.py
@@ -148,4 +148,5 @@ class Qwen3CoderDetector(BaseFormatDetector):
             function_format="xml",
             call_rule_fmt='"<function={name}>\\n" {arguments_rule} "\\n</function>"',
             key_value_rule_fmt='"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
+            key_value_separator="\\n",
         )