sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +6 -0
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +7 -7
  6. sglang/srt/disaggregation/decode.py +8 -3
  7. sglang/srt/disaggregation/mooncake/conn.py +43 -25
  8. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  9. sglang/srt/distributed/parallel_state.py +4 -2
  10. sglang/srt/entrypoints/context.py +3 -20
  11. sglang/srt/entrypoints/engine.py +13 -8
  12. sglang/srt/entrypoints/harmony_utils.py +2 -0
  13. sglang/srt/entrypoints/http_server.py +4 -5
  14. sglang/srt/entrypoints/openai/protocol.py +0 -9
  15. sglang/srt/entrypoints/openai/serving_chat.py +59 -265
  16. sglang/srt/entrypoints/openai/tool_server.py +4 -3
  17. sglang/srt/function_call/ebnf_composer.py +1 -0
  18. sglang/srt/function_call/function_call_parser.py +2 -0
  19. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  20. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  21. sglang/srt/function_call/kimik2_detector.py +3 -3
  22. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  23. sglang/srt/jinja_template_utils.py +6 -0
  24. sglang/srt/layers/attention/aiter_backend.py +370 -107
  25. sglang/srt/layers/attention/ascend_backend.py +3 -0
  26. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  27. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  28. sglang/srt/layers/attention/flashinfer_backend.py +52 -13
  29. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  30. sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
  31. sglang/srt/layers/attention/vision.py +9 -1
  32. sglang/srt/layers/attention/wave_backend.py +627 -0
  33. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  34. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  35. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  36. sglang/srt/layers/communicator.py +8 -10
  37. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  38. sglang/srt/layers/linear.py +1 -0
  39. sglang/srt/layers/moe/cutlass_moe.py +11 -16
  40. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  41. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  42. sglang/srt/layers/moe/ep_moe/layer.py +60 -2
  43. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
  46. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  47. sglang/srt/layers/moe/topk.py +4 -1
  48. sglang/srt/layers/quantization/__init__.py +5 -3
  49. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  50. sglang/srt/layers/quantization/fp8_utils.py +22 -10
  51. sglang/srt/layers/quantization/modelopt_quant.py +6 -11
  52. sglang/srt/layers/quantization/mxfp4.py +4 -1
  53. sglang/srt/layers/quantization/w4afp8.py +20 -11
  54. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  55. sglang/srt/layers/rotary_embedding.py +281 -2
  56. sglang/srt/lora/backend/base_backend.py +3 -23
  57. sglang/srt/lora/layers.py +60 -114
  58. sglang/srt/lora/lora.py +17 -62
  59. sglang/srt/lora/lora_manager.py +12 -48
  60. sglang/srt/lora/lora_registry.py +20 -9
  61. sglang/srt/lora/mem_pool.py +20 -63
  62. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  63. sglang/srt/lora/utils.py +25 -58
  64. sglang/srt/managers/cache_controller.py +21 -29
  65. sglang/srt/managers/detokenizer_manager.py +1 -1
  66. sglang/srt/managers/io_struct.py +6 -6
  67. sglang/srt/managers/mm_utils.py +1 -2
  68. sglang/srt/managers/multimodal_processor.py +1 -1
  69. sglang/srt/managers/schedule_batch.py +35 -20
  70. sglang/srt/managers/schedule_policy.py +6 -6
  71. sglang/srt/managers/scheduler.py +15 -7
  72. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  73. sglang/srt/managers/tokenizer_manager.py +25 -26
  74. sglang/srt/mem_cache/allocator.py +61 -87
  75. sglang/srt/mem_cache/hicache_storage.py +1 -1
  76. sglang/srt/mem_cache/hiradix_cache.py +34 -24
  77. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  78. sglang/srt/mem_cache/memory_pool_host.py +33 -35
  79. sglang/srt/mem_cache/radix_cache.py +2 -5
  80. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  81. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  82. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  83. sglang/srt/model_executor/cuda_graph_runner.py +22 -3
  84. sglang/srt/model_executor/forward_batch_info.py +26 -5
  85. sglang/srt/model_executor/model_runner.py +129 -35
  86. sglang/srt/model_loader/loader.py +18 -6
  87. sglang/srt/models/deepseek_v2.py +74 -35
  88. sglang/srt/models/gemma2.py +0 -34
  89. sglang/srt/models/gemma3n_mm.py +8 -9
  90. sglang/srt/models/glm4.py +6 -0
  91. sglang/srt/models/glm4_moe.py +9 -9
  92. sglang/srt/models/glm4v.py +589 -0
  93. sglang/srt/models/glm4v_moe.py +400 -0
  94. sglang/srt/models/gpt_oss.py +136 -19
  95. sglang/srt/models/granite.py +0 -25
  96. sglang/srt/models/llama.py +0 -25
  97. sglang/srt/models/llama4.py +1 -1
  98. sglang/srt/models/qwen2_5_vl.py +7 -3
  99. sglang/srt/models/qwen2_audio.py +10 -9
  100. sglang/srt/models/qwen3.py +0 -24
  101. sglang/srt/models/registry.py +1 -1
  102. sglang/srt/models/torch_native_llama.py +0 -24
  103. sglang/srt/multimodal/processors/base_processor.py +23 -13
  104. sglang/srt/multimodal/processors/glm4v.py +132 -0
  105. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  106. sglang/srt/reasoning_parser.py +316 -0
  107. sglang/srt/server_args.py +115 -139
  108. sglang/srt/speculative/eagle_worker.py +16 -0
  109. sglang/srt/two_batch_overlap.py +12 -4
  110. sglang/srt/utils.py +3 -3
  111. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  112. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  113. sglang/test/doc_patch.py +59 -0
  114. sglang/test/few_shot_gsm8k.py +1 -1
  115. sglang/test/few_shot_gsm8k_engine.py +1 -1
  116. sglang/test/run_eval.py +4 -1
  117. sglang/test/simple_eval_common.py +6 -0
  118. sglang/test/simple_eval_gpqa.py +2 -0
  119. sglang/test/test_fp4_moe.py +118 -36
  120. sglang/utils.py +1 -1
  121. sglang/version.py +1 -1
  122. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +26 -30
  123. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +127 -115
  124. sglang/lang/backend/__init__.py +0 -0
  125. sglang/srt/function_call/harmony_tool_parser.py +0 -130
  126. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  127. /sglang/{api.py → lang/api.py} +0 -0
  128. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
  129. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  130. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0

sglang/srt/entrypoints/openai/serving_chat.py

@@ -7,18 +7,8 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
-from openai_harmony import Message as OpenAIMessage
 
 from sglang.srt.conversation import generate_chat_conv
-from sglang.srt.entrypoints.harmony_utils import (
-    get_developer_message,
-    get_stop_tokens_for_assistant_actions,
-    get_streamable_parser_for_assistant,
-    get_system_message,
-    parse_chat_input,
-    parse_output_into_messages,
-    render_for_completion,
-)
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,

@@ -57,30 +47,12 @@ class OpenAIServingChat(OpenAIServingBase):
     """Handler for /v1/chat/completions requests"""
 
     def __init__(
-        self, tokenizer_manager: TokenizerManager, template_manager: TemplateManager
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
-        self.use_harmony = (
-            self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
-        )
-
-        if self.use_harmony:
-            from sglang.srt.function_call.harmony_tool_parser import (
-                HarmonyToolCallParser,
-            )
-
-            self.harmony_tool_parser = HarmonyToolCallParser()
-
-            # NOTE While OpenAI's chat completion API supports browsing
-            # for some models, currently vLLM doesn't support it. Please use the
-            # Responses API instead.
-            self.supports_browsing = False
-            self.browser_tool = None
-            # NOTE: Chat completion API does not support code interpreter.
-            # Please use the Responses API instead.
-            self.supports_code_interpreter = False
-            self.python_tool = None
 
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"

@@ -97,6 +69,18 @@ class OpenAIServingChat(OpenAIServingBase):
         ):
             return "Tools cannot be empty if tool choice is set to required."
 
+        max_output_tokens = request.max_completion_tokens or request.max_tokens
+        server_context_length = self.tokenizer_manager.server_args.context_length
+        if (
+            max_output_tokens
+            and server_context_length
+            and max_output_tokens > server_context_length
+        ):
+            return (
+                f"max_completion_tokens is too large: {max_output_tokens}."
+                f"This model supports at most {server_context_length} completion tokens."
+            )
+
         return None
 
     def _convert_to_internal_request(
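
The added check rejects a request whose completion budget cannot fit in the server's context window before it is scheduled. A minimal standalone sketch of the same logic (the function name `validate_output_budget` is ours; the precedence of `max_completion_tokens` over the legacy `max_tokens` mirrors the diff):

```python
from typing import Optional


def validate_output_budget(
    max_completion_tokens: Optional[int],
    max_tokens: Optional[int],
    context_length: Optional[int],
) -> Optional[str]:
    """Return an error string when the requested budget cannot fit, else None."""
    # The newer OpenAI field wins over the legacy one, as in the diff above.
    max_output_tokens = max_completion_tokens or max_tokens
    if max_output_tokens and context_length and max_output_tokens > context_length:
        return (
            f"max_completion_tokens is too large: {max_output_tokens}. "
            f"This model supports at most {context_length} completion tokens."
        )
    return None


assert validate_output_budget(None, None, 8192) is None       # nothing requested
assert validate_output_budget(4096, None, 8192) is None       # fits
assert validate_output_budget(16384, None, 8192) is not None  # rejected early
```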

@@ -107,66 +91,43 @@ class OpenAIServingChat(OpenAIServingBase):
         is_multimodal = self.tokenizer_manager.model_config.is_multimodal
 
         # Process messages and apply chat template
-        if not self.use_harmony:
-            processed_messages = self._process_messages(request, is_multimodal)
-
-            # Build sampling parameters
-            sampling_params = self._build_sampling_params(
-                request,
-                processed_messages.stop,
-                processed_messages.tool_call_constraint,
-            )
+        processed_messages = self._process_messages(request, is_multimodal)
 
-            # Handle single vs multiple requests
-            if is_multimodal:
-                prompt_kwargs = {"text": processed_messages.prompt}
-            else:
-                if isinstance(processed_messages.prompt_ids, str):
-                    prompt_kwargs = {"text": processed_messages.prompt_ids}
-                else:
-                    prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
-
-            adapted_request = GenerateReqInput(
-                **prompt_kwargs,
-                image_data=processed_messages.image_data,
-                video_data=processed_messages.video_data,
-                audio_data=processed_messages.audio_data,
-                sampling_params=sampling_params,
-                return_logprob=request.logprobs,
-                logprob_start_len=-1,
-                top_logprobs_num=request.top_logprobs or 0,
-                stream=request.stream,
-                return_text_in_logprobs=True,
-                modalities=processed_messages.modalities,
-                lora_path=request.lora_path,
-                bootstrap_host=request.bootstrap_host,
-                bootstrap_port=request.bootstrap_port,
-                bootstrap_room=request.bootstrap_room,
-                return_hidden_states=request.return_hidden_states,
-                rid=request.rid,
-            )
+        # Build sampling parameters
+        sampling_params = self._build_sampling_params(
+            request,
+            processed_messages.stop,
+            processed_messages.tool_call_constraint,
+        )
+
+        # Handle single vs multiple requests
+        if is_multimodal:
+            prompt_kwargs = {"text": processed_messages.prompt}
         else:
-            processed_messages, prompt_ids = self._make_request_with_harmony(request)
-
-            adapted_request = GenerateReqInput(
-                input_ids=prompt_ids,
-                sampling_params=self._build_sampling_params(
-                    request,
-                    request.stop,
-                    tool_call_constraint=None,
-                ),
-                stream=request.stream,
-                return_logprob=request.logprobs,
-                logprob_start_len=-1,
-                top_logprobs_num=request.top_logprobs or 0,
-                return_text_in_logprobs=True,
-                lora_path=request.lora_path,
-                bootstrap_host=request.bootstrap_host,
-                bootstrap_port=request.bootstrap_port,
-                bootstrap_room=request.bootstrap_room,
-                return_hidden_states=request.return_hidden_states,
-                rid=request.rid,
-            )
+            if isinstance(processed_messages.prompt_ids, str):
+                prompt_kwargs = {"text": processed_messages.prompt_ids}
+            else:
+                prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
+
+        adapted_request = GenerateReqInput(
+            **prompt_kwargs,
+            image_data=processed_messages.image_data,
+            video_data=processed_messages.video_data,
+            audio_data=processed_messages.audio_data,
+            sampling_params=sampling_params,
+            return_logprob=request.logprobs,
+            logprob_start_len=-1,
+            top_logprobs_num=request.top_logprobs or 0,
+            stream=request.stream,
+            return_text_in_logprobs=True,
+            modalities=processed_messages.modalities,
+            lora_path=request.lora_path,
+            bootstrap_host=request.bootstrap_host,
+            bootstrap_port=request.bootstrap_port,
+            bootstrap_room=request.bootstrap_room,
+            return_hidden_states=request.return_hidden_states,
+            rid=request.rid,
+        )
 
         return adapted_request, request
 

@@ -251,14 +212,15 @@
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
+                reasoning_effort=request.reasoning_effort,
                 **(
                     request.chat_template_kwargs if request.chat_template_kwargs else {}
                 ),
             )
         except Exception:
-                # This except branch will be triggered when the chosen model
-                # has a different tools input format that is not compatible
-                # with openAI's apply_chat_template tool_call format, like Mistral.
+            # This except branch will be triggered when the chosen model
+            # has a different tools input format that is not compatible
+            # with openAI's apply_chat_template tool_call format, like Mistral.
             tools = (
                 [t if "function" in t else {"function": t} for t in tools]
                 if tools

@@ -269,6 +231,7 @@
                 tokenize=True,
                 add_generation_prompt=True,
                 tools=tools,
+                reasoning_effort=request.reasoning_effort,
                 **(
                     request.chat_template_kwargs if request.chat_template_kwargs else {}
                 ),
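
Both `apply_chat_template` call sites now forward `request.reasoning_effort` as a template kwarg. Hugging Face tokenizers expose extra keyword arguments of `apply_chat_template` as variables to the Jinja chat template, so a template that references `reasoning_effort` (as gpt-oss-style templates do) renders it into the prompt, while templates that never reference it simply ignore it. A sketch, with an illustrative checkpoint id:

```python
from transformers import AutoTokenizer

# Illustrative checkpoint; any tokenizer whose chat template reads
# `reasoning_effort` behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")

prompt_ids = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Explain KV caches briefly."}],
    tokenize=True,
    add_generation_prompt=True,
    reasoning_effort="high",  # surfaced to the Jinja template as a variable
)
```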

@@ -459,12 +422,6 @@
         cached_tokens = {}
         hidden_states = {}
 
-        # Harmony tracking
-        if self.use_harmony:
-            harmony_parsers = [
-                get_streamable_parser_for_assistant() for _ in range(request.n)
-            ]
-
         try:
             async for content in self.tokenizer_manager.generate_request(
                 adapted_request, raw_request

@@ -511,58 +468,14 @@
                         )
                        yield f"data: {chunk.model_dump_json()}\n\n"
 
-                    # Process content delta
-                    if self.use_harmony:
-                        harmony_parser = harmony_parsers[index]
-
-                        new_token_ids = content["output_ids"]
-                        for token_id in new_token_ids:
-                            harmony_parser.process(token_id)
-
-                        is_final = harmony_parser.current_channel == "final"
-                        is_analysis = harmony_parser.current_channel == "analysis"
-                        delta = harmony_parser.last_content_delta or ""
-
-                        if is_analysis:
-                            choice_data = ChatCompletionResponseStreamChoice(
-                                index=index,
-                                delta=DeltaMessage(reasoning_content=delta),
-                                finish_reason=None,
-                            )
-                            chunk = ChatCompletionStreamResponse(
-                                id=content["meta_info"]["id"],
-                                created=int(time.time()),
-                                choices=[choice_data],
-                                model=request.model,
-                            )
-                            yield f"data: {chunk.model_dump_json()}\n\n"
-                            continue
-
-                        choice_data = ChatCompletionResponseStreamChoice(
-                            index=index,
-                            delta=DeltaMessage(content=delta if delta else None),
-                            finish_reason=None,
-                            matched_stop=None,
-                            logprobs=choice_logprobs,
-                        )
-                        chunk = ChatCompletionStreamResponse(
-                            id=content["meta_info"]["id"],
-                            created=int(time.time()),
-                            choices=[choice_data],
-                            model=request.model,
-                        )
-                        yield f"data: {chunk.model_dump_json()}\n\n"
-                        continue
-                    else:
-                        stream_buffer = stream_buffers.get(index, "")
-                        delta = content["text"][len(stream_buffer) :]
-                        stream_buffers[index] = stream_buffer + delta
+                    stream_buffer = stream_buffers.get(index, "")
+                    delta = content["text"][len(stream_buffer) :]
+                    stream_buffers[index] = stream_buffer + delta
 
                     # Handle reasoning content
                     if (
                         self.tokenizer_manager.server_args.reasoning_parser
                         and request.separate_reasoning
-                        and not self.use_harmony
                     ):
                         reasoning_text, delta = self._process_reasoning_stream(
                             index, delta, reasoning_parser_dict, content, request

@@ -581,27 +494,8 @@
                         )
                         yield f"data: {chunk.model_dump_json()}\n\n"
 
-                    if self.use_harmony and not is_final:
-                        choice_data = ChatCompletionResponseStreamChoice(
-                            index=index,
-                            delta=DeltaMessage(reasoning_content=delta),
-                            finish_reason=None,
-                        )
-                        chunk = ChatCompletionStreamResponse(
-                            id=content["meta_info"]["id"],
-                            created=int(time.time()),
-                            choices=[choice_data],
-                            model=request.model,
-                        )
-                        yield f"data: {chunk.model_dump_json()}\n\n"
-
                     # Handle tool calls
-                    # TODO: support tool call parsing for harmony
-                    if (
-                        request.tool_choice != "none"
-                        and request.tools
-                        and not self.use_harmony
-                    ):
+                    if request.tool_choice != "none" and request.tools:
                         async for chunk in self._process_tool_call_stream(
                             index,
                             delta,

@@ -765,76 +659,6 @@
 
             finish_reason = ret_item["meta_info"]["finish_reason"]
             text = ret_item["text"]
-            output_ids = ret_item["output_ids"]
-
-            if self.use_harmony:
-                parser = parse_output_into_messages(output_ids)
-                output_msgs = parser.messages
-                if len(output_msgs) == 0:
-                    # The generation has stopped during reasoning.
-                    is_tool_call = False
-                    reasoning_content = parser.current_content
-                    final_content = None
-                elif len(output_msgs) == 1:
-                    # The generation has stopped during final message.
-                    is_tool_call = False
-                    reasoning_content = output_msgs[0].content[0].text
-                    final_content = parser.current_content
-                else:
-                    if len(output_msgs) != 2:
-                        raise ValueError(
-                            "Expected 2 output messages (reasoning and final), "
-                            f"but got {len(output_msgs)}."
-                        )
-                    reasoning_msg, final_msg = output_msgs
-                    reasoning_content = reasoning_msg.content[0].text
-                    final_content = final_msg.content[0].text
-                    is_tool_call = final_msg.recipient is not None
-
-                if is_tool_call:
-                    # Extract tool call information from final message
-                    tool_call = (
-                        self.harmony_tool_parser.extract_tool_calls_from_message(
-                            final_msg
-                        )
-                    )
-                    tool_calls = [tool_call] if tool_call else []
-
-                    message = ChatMessage(
-                        role="assistant",
-                        reasoning_content=reasoning_content,
-                        content=None,  # Tool calls don't have regular content
-                        tool_calls=tool_calls,
-                    )
-                else:
-                    # Normal message
-                    message = ChatMessage(
-                        role="assistant",
-                        reasoning_content=reasoning_content,
-                        content=final_content,
-                    )
-
-                if is_tool_call:
-                    finish_reason_type = "tool_calls"
-                elif finish_reason:
-                    finish_reason_type = (
-                        finish_reason["type"] if finish_reason else "stop"
-                    )
-                else:
-                    finish_reason_type = "stop"
-                choice_data = ChatCompletionResponseChoice(
-                    index=idx,
-                    message=message,
-                    logprobs=choice_logprobs,
-                    finish_reason=finish_reason_type,
-                    matched_stop=(
-                        finish_reason["matched"]
-                        if finish_reason and "matched" in finish_reason
-                        else None
-                    ),
-                )
-                choices.append(choice_data)
-                continue
 
             # Handle reasoning content
             reasoning_text = None

@@ -1184,33 +1008,3 @@
             return f"data: {chunk.model_dump_json()}\n\n"
 
         return None
-
-    def _make_request_with_harmony(
-        self,
-        request: ChatCompletionRequest,
-    ):
-        messages: list[OpenAIMessage] = []
-
-        # Add system message.
-        # In Chat Completion API, browsing is enabled by default if the model
-        # supports it.
-        assert not self.supports_browsing
-        assert not self.supports_code_interpreter
-        sys_msg = get_system_message(
-            reasoning_effort=request.reasoning_effort,
-            browser_description=None,
-            python_description=None,
-        )
-        messages.append(sys_msg)
-
-        # Add developer message.
-        dev_msg = get_developer_message()
-        messages.append(dev_msg)
-
-        # Add user message.
-        for chat_msg in request.messages:
-            messages.append(parse_chat_input(chat_msg))
-
-        # Render prompt token ids.
-        prompt_token_ids = render_for_completion(messages)
-        return messages, prompt_token_ids
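
That is the last serving_chat.py hunk. Taken together with the deleted sglang/srt/function_call/harmony_tool_parser.py and the new gpt_oss_detector.py and reasoning_parser.py entries in the file list above, these removals retire the Harmony-specific request path inside the chat-completions handler and route gpt-oss models through the standard message-processing, reasoning-parser, and tool-call pipeline (see the parser registration below).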

sglang/srt/entrypoints/openai/tool_server.py

@@ -5,16 +5,17 @@ from abc import ABC, abstractmethod
 from contextlib import AbstractAsyncContextManager, asynccontextmanager
 from typing import Any
 
-logger = logging.getLogger(__name__)
 try:
     from mcp import ClientSession
     from mcp.client.sse import sse_client
     from mcp.types import ListToolsResult
-except ImportError:
-    logger.warning("Ignoring mcp import error")
+except ImportError as e:
+    ClientSession = sse_client = ListToolsResult = e
 
 from openai_harmony import ToolDescription, ToolNamespaceConfig
 
+logger = logging.getLogger(__name__)
+
 
 async def list_server_and_tools(server_url: str):
 
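
The rewritten import guard binds the names from the failed `mcp` import to the `ImportError` itself rather than logging and continuing with undefined names: the module still imports without `mcp` installed, but code that actually touches `ClientSession`, `sse_client`, or `ListToolsResult` now fails with the original error attached. A generic sketch of the pattern, using a hypothetical optional dependency:

```python
try:
    from some_optional_dep import Client  # hypothetical optional dependency
except ImportError as e:
    # Defer the failure: importing this module still works, but the name
    # now holds the ImportError instead of a usable class.
    Client = e


def connect():
    if isinstance(Client, Exception):
        raise RuntimeError("some_optional_dep is required for connect()") from Client
    return Client()


# Without some_optional_dep installed, connect() raises a RuntimeError
# chained to the original ImportError; a plain import of this module is fine.
```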

sglang/srt/function_call/ebnf_composer.py

@@ -316,6 +316,7 @@ class EBNFComposer:
 
             combined_args = "".join(rule_parts)
             arguments_rule = args_template.format(arg_rules=combined_args)
+            arguments_rule = arguments_rule or '""'
 
             # Add the function call rule and its arguments rule
             ebnf_lines.append(
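
This one-line fix covers tools that declare no parameters: for those, `rule_parts` is empty, `arguments_rule` formats to an empty string, and the emitted production would have an empty right-hand side. Falling back to the literal `'""'` keeps the grammar well-formed. A toy illustration (this `args_template` is a stand-in, not the composer's real template):

```python
args_template = "{arg_rules}"  # stand-in for the composer's real template
rule_parts = []                # a tool that declares no parameters

combined_args = "".join(rule_parts)
arguments_rule = args_template.format(arg_rules=combined_args)
assert arguments_rule == ""    # would emit a production with an empty body

arguments_rule = arguments_rule or '""'
assert arguments_rule == '""'  # the rule now matches an empty string literal
```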

sglang/srt/function_call/function_call_parser.py

@@ -11,6 +11,7 @@ from sglang.srt.function_call.base_format_detector import BaseFormatDetector
 from sglang.srt.function_call.core_types import ToolCallItem
 from sglang.srt.function_call.deepseekv3_detector import DeepSeekV3Detector
 from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
+from sglang.srt.function_call.gpt_oss_detector import GptOssDetector
 from sglang.srt.function_call.kimik2_detector import KimiK2Detector
 from sglang.srt.function_call.llama32_detector import Llama32Detector
 from sglang.srt.function_call.mistral_detector import MistralDetector

@@ -41,6 +42,7 @@ class FunctionCallParser:
         "qwen3_coder": Qwen3CoderDetector,
         "glm45": Glm4MoeDetector,
         "step3": Step3Detector,
+        "gpt-oss": GptOssDetector,
     }
 
     def __init__(self, tools: List[Tool], tool_call_parser: str):
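
With the import and registry entry in place, the detector is selected by its key like any other parser; on a running server the equivalent switch is `--tool-call-parser gpt-oss`. A hedged usage sketch (the `Tool`/`Function` protocol types and `parse_non_stream` are existing sglang APIs, but the tool definition and model output below are placeholders):

```python
from sglang.srt.entrypoints.openai.protocol import Function, Tool
from sglang.srt.function_call.function_call_parser import FunctionCallParser

tools = [
    Tool(function=Function(name="get_weather", parameters={"type": "object"}))
]
parser = FunctionCallParser(tools=tools, tool_call_parser="gpt-oss")

# Placeholder: in practice this is the completed text from a gpt-oss model.
model_output = ""
normal_text, calls = parser.parse_non_stream(model_output)  # (plain text, tool calls)
```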

sglang/srt/function_call/glm4_moe_detector.py

@@ -158,7 +158,7 @@ class Glm4MoeDetector(BaseFormatDetector):
             individual_call_end_token=self.eot_token,
             tool_call_separator="\\n",
             function_format="xml",
-            call_rule_fmt='"{name}" "\\n" {arguments_rule} "\\n"',
+            call_rule_fmt='"{name}" "\\n" ( {arguments_rule} "\\n" )?',
             key_value_rule_fmt='"<arg_key>{key}</arg_key>" "\\n" "<arg_value>" {valrule} "</arg_value>"',
             key_value_separator="\\n",
         )
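
The Glm4MoeDetector change relaxes the constrained-decoding grammar: the old `call_rule_fmt` made the arguments block mandatory, so a tool that takes no arguments could never be emitted under grammar constraints; wrapping the block in `( ... )?` makes it optional. Rendering both formats for a hypothetical zero-argument function shows the difference:

```python
old_fmt = '"{name}" "\\n" {arguments_rule} "\\n"'
new_fmt = '"{name}" "\\n" ( {arguments_rule} "\\n" )?'

print(old_fmt.format(name="ping", arguments_rule="ping_args"))
# "ping" "\n" ping_args "\n"        <- arguments block always required
print(new_fmt.format(name="ping", arguments_rule="ping_args"))
# "ping" "\n" ( ping_args "\n" )?   <- arguments block now optional
```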