sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +119 -17
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +42 -7
  6. sglang/srt/conversation.py +9 -5
  7. sglang/srt/disaggregation/base/conn.py +5 -2
  8. sglang/srt/disaggregation/decode.py +14 -4
  9. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  10. sglang/srt/disaggregation/mooncake/conn.py +286 -160
  11. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  12. sglang/srt/disaggregation/prefill.py +2 -0
  13. sglang/srt/distributed/parallel_state.py +15 -11
  14. sglang/srt/entrypoints/context.py +227 -0
  15. sglang/srt/entrypoints/engine.py +15 -9
  16. sglang/srt/entrypoints/harmony_utils.py +372 -0
  17. sglang/srt/entrypoints/http_server.py +74 -4
  18. sglang/srt/entrypoints/openai/protocol.py +218 -1
  19. sglang/srt/entrypoints/openai/serving_chat.py +41 -11
  20. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  21. sglang/srt/entrypoints/openai/tool_server.py +175 -0
  22. sglang/srt/entrypoints/tool.py +87 -0
  23. sglang/srt/eplb/expert_location.py +5 -1
  24. sglang/srt/function_call/ebnf_composer.py +1 -0
  25. sglang/srt/function_call/function_call_parser.py +2 -0
  26. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  27. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  28. sglang/srt/function_call/kimik2_detector.py +3 -3
  29. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  30. sglang/srt/hf_transformers_utils.py +30 -3
  31. sglang/srt/jinja_template_utils.py +14 -1
  32. sglang/srt/layers/attention/aiter_backend.py +375 -115
  33. sglang/srt/layers/attention/ascend_backend.py +3 -0
  34. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  35. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  36. sglang/srt/layers/attention/flashinfer_backend.py +52 -13
  37. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  38. sglang/srt/layers/attention/triton_backend.py +85 -14
  39. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  40. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  41. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  42. sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
  43. sglang/srt/layers/attention/vision.py +22 -6
  44. sglang/srt/layers/attention/wave_backend.py +627 -0
  45. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  46. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  47. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  48. sglang/srt/layers/communicator.py +29 -14
  49. sglang/srt/layers/dp_attention.py +12 -0
  50. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  51. sglang/srt/layers/linear.py +3 -7
  52. sglang/srt/layers/moe/cutlass_moe.py +12 -3
  53. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  54. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  55. sglang/srt/layers/moe/ep_moe/layer.py +135 -73
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  59. sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
  60. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  61. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  62. sglang/srt/layers/moe/topk.py +16 -4
  63. sglang/srt/layers/moe/utils.py +16 -0
  64. sglang/srt/layers/quantization/__init__.py +27 -3
  65. sglang/srt/layers/quantization/fp4.py +557 -0
  66. sglang/srt/layers/quantization/fp8.py +3 -6
  67. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  68. sglang/srt/layers/quantization/fp8_utils.py +51 -10
  69. sglang/srt/layers/quantization/modelopt_quant.py +258 -68
  70. sglang/srt/layers/quantization/mxfp4.py +654 -0
  71. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  72. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  73. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  74. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  75. sglang/srt/layers/quantization/quark/utils.py +107 -0
  76. sglang/srt/layers/quantization/unquant.py +60 -6
  77. sglang/srt/layers/quantization/w4afp8.py +21 -12
  78. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  79. sglang/srt/layers/rotary_embedding.py +506 -3
  80. sglang/srt/layers/utils.py +9 -0
  81. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  82. sglang/srt/lora/backend/base_backend.py +3 -23
  83. sglang/srt/lora/layers.py +60 -114
  84. sglang/srt/lora/lora.py +17 -62
  85. sglang/srt/lora/lora_manager.py +82 -62
  86. sglang/srt/lora/lora_registry.py +23 -11
  87. sglang/srt/lora/mem_pool.py +63 -68
  88. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  89. sglang/srt/lora/utils.py +25 -58
  90. sglang/srt/managers/cache_controller.py +75 -58
  91. sglang/srt/managers/detokenizer_manager.py +1 -1
  92. sglang/srt/managers/io_struct.py +20 -8
  93. sglang/srt/managers/mm_utils.py +6 -13
  94. sglang/srt/managers/multimodal_processor.py +1 -1
  95. sglang/srt/managers/schedule_batch.py +61 -25
  96. sglang/srt/managers/schedule_policy.py +6 -6
  97. sglang/srt/managers/scheduler.py +41 -19
  98. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  99. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  100. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  101. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  102. sglang/srt/managers/template_manager.py +35 -1
  103. sglang/srt/managers/tokenizer_manager.py +47 -30
  104. sglang/srt/managers/tp_worker.py +3 -0
  105. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  106. sglang/srt/mem_cache/allocator.py +61 -87
  107. sglang/srt/mem_cache/hicache_storage.py +1 -1
  108. sglang/srt/mem_cache/hiradix_cache.py +80 -22
  109. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  110. sglang/srt/mem_cache/memory_pool_host.py +34 -36
  111. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  112. sglang/srt/mem_cache/radix_cache.py +2 -5
  113. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  114. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  115. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  116. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  117. sglang/srt/model_executor/cuda_graph_runner.py +29 -9
  118. sglang/srt/model_executor/forward_batch_info.py +61 -19
  119. sglang/srt/model_executor/model_runner.py +148 -37
  120. sglang/srt/model_loader/loader.py +18 -6
  121. sglang/srt/model_loader/weight_utils.py +10 -0
  122. sglang/srt/models/bailing_moe.py +425 -0
  123. sglang/srt/models/deepseek_v2.py +137 -59
  124. sglang/srt/models/ernie4.py +426 -0
  125. sglang/srt/models/ernie4_eagle.py +203 -0
  126. sglang/srt/models/gemma2.py +0 -34
  127. sglang/srt/models/gemma3n_mm.py +38 -0
  128. sglang/srt/models/glm4.py +6 -0
  129. sglang/srt/models/glm4_moe.py +28 -16
  130. sglang/srt/models/glm4v.py +589 -0
  131. sglang/srt/models/glm4v_moe.py +400 -0
  132. sglang/srt/models/gpt_oss.py +1251 -0
  133. sglang/srt/models/granite.py +0 -25
  134. sglang/srt/models/llama.py +0 -25
  135. sglang/srt/models/llama4.py +1 -1
  136. sglang/srt/models/qwen2.py +6 -0
  137. sglang/srt/models/qwen2_5_vl.py +7 -3
  138. sglang/srt/models/qwen2_audio.py +10 -9
  139. sglang/srt/models/qwen2_moe.py +6 -0
  140. sglang/srt/models/qwen3.py +0 -24
  141. sglang/srt/models/qwen3_moe.py +32 -6
  142. sglang/srt/models/registry.py +1 -1
  143. sglang/srt/models/step3_vl.py +9 -0
  144. sglang/srt/models/torch_native_llama.py +0 -24
  145. sglang/srt/models/transformers.py +2 -5
  146. sglang/srt/multimodal/processors/base_processor.py +23 -13
  147. sglang/srt/multimodal/processors/glm4v.py +132 -0
  148. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  149. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  150. sglang/srt/reasoning_parser.py +332 -37
  151. sglang/srt/server_args.py +186 -75
  152. sglang/srt/speculative/eagle_worker.py +16 -0
  153. sglang/srt/two_batch_overlap.py +169 -9
  154. sglang/srt/utils.py +41 -5
  155. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  156. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  157. sglang/test/doc_patch.py +59 -0
  158. sglang/test/few_shot_gsm8k.py +1 -1
  159. sglang/test/few_shot_gsm8k_engine.py +1 -1
  160. sglang/test/run_eval.py +4 -1
  161. sglang/test/runners.py +2 -2
  162. sglang/test/simple_eval_common.py +6 -0
  163. sglang/test/simple_eval_gpqa.py +2 -0
  164. sglang/test/test_fp4_moe.py +118 -36
  165. sglang/test/test_utils.py +1 -1
  166. sglang/utils.py +1 -1
  167. sglang/version.py +1 -1
  168. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
  169. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
  170. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  171. /sglang/{api.py → lang/api.py} +0 -0
  172. /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
  173. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
  174. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  175. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/function_call/kimik2_detector.py

@@ -24,7 +24,7 @@ class KimiK2Detector(BaseFormatDetector):
     Format Structure:
     ```
     <|tool_calls_section_begin|>
-    <|tool_call_begin|>functions.{func_name}:{index} <|tool_call_argument_begin|>{json_args}<|tool_call_end|>
+    <|tool_call_begin|>functions.{func_name}:{index}<|tool_call_argument_begin|>{json_args}<|tool_call_end|>
     <|tool_calls_section_end|>
     ```
 
@@ -219,7 +219,7 @@ class KimiK2Detector(BaseFormatDetector):
 
         def get_info(name: str) -> StructureInfo:
             return StructureInfo(
-                begin=f"<|tool_calls_section_begin|><|tool_call_begin|>functions.{name}:0 <|tool_call_argument_begin|>",
+                begin=f"<|tool_calls_section_begin|><|tool_call_begin|>functions.{name}:0<|tool_call_argument_begin|>",
                 end="<|tool_call_end|><|tool_calls_section_end|>",
                 trigger="<|tool_calls_section_begin|>",
             )
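
Note: the two edits above drop the stray space after the call index, so the serialized tool call is one uninterrupted token sequence. A minimal sketch of the resulting wire format (the tool name and arguments are made up for illustration; this is not part of the diff):

import json

# Hypothetical tool call, shown only to illustrate the corrected Kimi-K2 format.
func_name, index = "get_weather", 0
json_args = json.dumps({"city": "Paris"})

call = (
    "<|tool_calls_section_begin|>"
    f"<|tool_call_begin|>functions.{func_name}:{index}"  # no space before the next token anymore
    f"<|tool_call_argument_begin|>{json_args}<|tool_call_end|>"
    "<|tool_calls_section_end|>"
)
print(call)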
@@ -240,6 +240,6 @@ class KimiK2Detector(BaseFormatDetector):
             sequence_start_token=self.bot_token,
             sequence_end_token=self.eot_token,
             tool_call_separator="",
-            call_rule_fmt='"<|tool_call_begin|>functions.{name}:" [0-9]+ " <|tool_call_argument_begin|>" {arguments_rule} "<|tool_call_end|>"',
+            call_rule_fmt='"<|tool_call_begin|>functions.{name}:"[0-9]+"<|tool_call_argument_begin|>"{arguments_rule}"<|tool_call_end|>"',
             function_format="json",
         )
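
The constrained-decoding grammar is updated to match: the literal spaces between the EBNF terminals are removed so the grammar accepts exactly the format shown in the docstring. A rough sketch of the rule that results once a tool name and its arguments rule are filled in (the .format() call and the rule names are illustrative, not EBNFComposer's actual API):

call_rule_fmt = '"<|tool_call_begin|>functions.{name}:"[0-9]+"<|tool_call_argument_begin|>"{arguments_rule}"<|tool_call_end|>"'

# Illustrative substitution; the composer performs an equivalent expansion internally.
rule = call_rule_fmt.format(name="get_weather", arguments_rule="get_weather_args")
print(rule)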
sglang/srt/function_call/qwen3_coder_detector.py

@@ -57,6 +57,15 @@ class Qwen3CoderDetector(BaseFormatDetector):
         )
         self._buf: str = ""
 
+        # Streaming state variables
+        self._current_function_name: str = ""
+        self._current_parameters: Dict[str, Any] = {}
+        self._streamed_parameters: Dict[str, str] = (
+            {}
+        )  # Track what parameter content we've streamed
+        self._in_tool_call: bool = False
+        self._function_name_sent: bool = False
+
     def has_tool_call(self, text: str) -> bool:
         return self.tool_call_start_token in text
 
@@ -70,23 +79,224 @@ class Qwen3CoderDetector(BaseFormatDetector):
         self._buf += new_text
         normal = ""
         calls: List[ToolCallItem] = []
+
+        # Build tool indices for validation
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
         while True:
-            if self.tool_call_start_token not in self._buf:
+            # If we're not in a tool call and don't see a start token, return normal text
+            if not self._in_tool_call and self.tool_call_start_token not in self._buf:
                 normal += self._buf
                 self._buf = ""
                 break
-            s = self._buf.find(self.tool_call_start_token)
-            if s > 0:
+
+            # Look for tool call start
+            if not self._in_tool_call:
+                s = self._buf.find(self.tool_call_start_token)
+                if s == -1:
+                    normal += self._buf
+                    self._buf = ""
+                    break
+
                 normal += self._buf[:s]
                 self._buf = self._buf[s:]
-            e = self._buf.find(self.tool_call_end_token)
-            if e == -1:
-                break
-            block = self._buf[: e + len(self.tool_call_end_token)]
-            self._buf = self._buf[e + len(self.tool_call_end_token) :]
-            calls.extend(self._parse_block(block, tools))
+
+                self._in_tool_call = True
+                self._function_name_sent = False
+                self._current_function_name = ""
+                self._current_parameters = {}
+                self._streamed_parameters = {}
+
+                # Remove the start token
+                self._buf = self._buf[len(self.tool_call_start_token) :]
+                continue
+
+            # We're in a tool call, try to parse function name if not sent yet
+            if not self._function_name_sent:
+                # Look for function name pattern: <function=name>
+                function_match = re.search(r"<function=([^>]+)>", self._buf)
+                if function_match:
+                    function_name = function_match.group(1).strip()
+
+                    # Validate function name
+                    if function_name in self._tool_indices:
+                        self._current_function_name = function_name
+                        self._function_name_sent = True
+
+                        # Initialize tool call tracking
+                        if self.current_tool_id == -1:
+                            self.current_tool_id = 0
+
+                        # Ensure tracking arrays are large enough
+                        while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                            self.prev_tool_call_arr.append({})
+                        while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                            self.streamed_args_for_tool.append("")
+
+                        # Store tool call info
+                        self.prev_tool_call_arr[self.current_tool_id] = {
+                            "name": function_name,
+                            "arguments": {},
+                        }
+
+                        # Send tool name with empty parameters
+                        calls.append(
+                            ToolCallItem(
+                                tool_index=self.current_tool_id,
+                                name=function_name,
+                                parameters="",
+                            )
+                        )
+
+                        # Remove the processed function declaration
+                        self._buf = self._buf[function_match.end() :]
+                        continue
+                    else:
+                        # Invalid function name, reset state
+                        logger.warning(f"Invalid function name: {function_name}")
+                        self._reset_streaming_state()
+                        normal += self._buf
+                        self._buf = ""
+                        break
+                else:
+                    # Function name not complete yet, wait for more text
+                    break
+
+            # Parse parameters incrementally
+            if self._function_name_sent:
+                # Process parameters and get any calls to emit
+                parameter_calls = self._parse_and_stream_parameters(self._buf)
+                calls.extend(parameter_calls)
+
+                # Check if tool call is complete
+                if self.tool_call_end_token in self._buf:
+                    end_pos = self._buf.find(self.tool_call_end_token)
+
+                    # Add closing brace to complete the JSON object
+                    current_streamed = self.streamed_args_for_tool[self.current_tool_id]
+                    if current_streamed:
+                        # Count opening and closing braces to check if JSON is complete
+                        open_braces = current_streamed.count("{")
+                        close_braces = current_streamed.count("}")
+                        if open_braces > close_braces:
+                            calls.append(
+                                ToolCallItem(
+                                    tool_index=self.current_tool_id,
+                                    name=None,
+                                    parameters="}",
+                                )
+                            )
+                            self.streamed_args_for_tool[self.current_tool_id] = (
+                                current_streamed + "}"
+                            )
+
+                    # Complete the tool call
+                    self._buf = self._buf[end_pos + len(self.tool_call_end_token) :]
+                    self._reset_streaming_state()
+                    self.current_tool_id += 1
+                    continue
+                else:
+                    # Tool call not complete yet, wait for more text
+                    break
+
         return StreamingParseResult(normal_text=normal, calls=calls)
 
+    def _parse_and_stream_parameters(self, text_to_parse: str) -> List[ToolCallItem]:
+        """
+        Parse complete parameter blocks from text and return any tool call items to emit.
+
+        This method:
+        1. Finds all complete <parameter> blocks
+        2. Parses them into a dictionary
+        3. Compares with current parameters and generates diff if needed
+        4. Updates internal state
+
+        Args:
+            text_to_parse: The text to search for parameter blocks
+
+        Returns:
+            List of ToolCallItem objects to emit (may be empty)
+        """
+        calls: List[ToolCallItem] = []
+
+        # Find all complete parameter patterns
+        param_matches = list(
+            re.finditer(
+                r"<parameter=([^>]+)>(.*?)</parameter>", text_to_parse, re.DOTALL
+            )
+        )
+
+        # Build new parameters dictionary
+        new_params = {}
+        for match in param_matches:
+            param_name = match.group(1).strip()
+            param_value = match.group(2)
+            new_params[param_name] = _safe_val(param_value)
+
+        # Calculate parameter diff to stream with proper incremental JSON building
+        if new_params != self._current_parameters:
+            previous_args_json = self.streamed_args_for_tool[self.current_tool_id]
+
+            # Build incremental JSON properly
+            if not self._current_parameters:
+                # First parameter(s) - start JSON object but don't close it yet
+                items = []
+                for key, value in new_params.items():
+                    items.append(
+                        f"{json.dumps(key, ensure_ascii=False)}: {json.dumps(value, ensure_ascii=False)}"
+                    )
+                json_fragment = "{" + ", ".join(items)
+
+                calls.append(
+                    ToolCallItem(
+                        tool_index=self.current_tool_id,
+                        name=None,
+                        parameters=json_fragment,
+                    )
+                )
+                self.streamed_args_for_tool[self.current_tool_id] = json_fragment
+
+            else:
+                # Additional parameters - add them incrementally
+                new_keys = set(new_params.keys()) - set(self._current_parameters.keys())
+                if new_keys:
+                    # Build the continuation part (no closing brace yet)
+                    continuation_parts = []
+                    for key in new_keys:
+                        value = new_params[key]
+                        continuation_parts.append(
+                            f"{json.dumps(key, ensure_ascii=False)}: {json.dumps(value, ensure_ascii=False)}"
+                        )
+
+                    json_fragment = ", " + ", ".join(continuation_parts)
+
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=None,
+                            parameters=json_fragment,
+                        )
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] = (
+                        previous_args_json + json_fragment
+                    )
+
+        # Update current state
+        self._current_parameters = new_params
+        self.prev_tool_call_arr[self.current_tool_id]["arguments"] = new_params
+
+        return calls
+
+    def _reset_streaming_state(self):
+        """Reset streaming state for the next tool call"""
+        self._in_tool_call = False
+        self._function_name_sent = False
+        self._current_function_name = ""
+        self._current_parameters = {}
+        self._streamed_parameters = {}
+        self.current_tool_name_sent = False
+
     def _extract(self, text: str, tools: List[Tool]) -> Tuple[str, List[ToolCallItem]]:
         normal_parts: List[str] = []
         calls: List[ToolCallItem] = []
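
The core idea of the new streaming path above is incremental JSON assembly: the first complete <parameter> block opens the arguments object without closing it, each later parameter is streamed as a ', "key": value' fragment, and the closing brace is only emitted once the tool-call end token arrives. A self-contained sketch of that fragment logic, independent of the detector classes (the parameter names and values are illustrative):

import json
from typing import Dict, List


def param_fragment(prev: Dict[str, object], new: Dict[str, object]) -> str:
    """Return the JSON fragment to stream when parsed parameters grow from `prev` to `new`."""
    added = {k: v for k, v in new.items() if k not in prev}
    if not added:
        return ""
    items = [
        f"{json.dumps(k, ensure_ascii=False)}: {json.dumps(v, ensure_ascii=False)}"
        for k, v in added.items()
    ]
    # The first parameters open the object; later ones are appended after a comma.
    return ("{" if not prev else ", ") + ", ".join(items)


# Simulate two parameter snapshots followed by the end token.
chunks: List[str] = []
state: Dict[str, object] = {}
for snapshot in [{"city": "Paris"}, {"city": "Paris", "unit": "celsius"}]:
    chunks.append(param_fragment(state, snapshot))
    state = snapshot
chunks.append("}")  # emitted when the tool-call end token is seen

assert json.loads("".join(chunks)) == {"city": "Paris", "unit": "celsius"}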
sglang/srt/hf_transformers_utils.py

@@ -14,10 +14,11 @@
 """Utilities for Huggingface Transformers."""
 
 import contextlib
+import json
 import os
 import warnings
 from pathlib import Path
-from typing import Dict, Optional, Type, Union
+from typing import Any, Dict, Optional, Type, Union
 
 import torch
 from huggingface_hub import snapshot_download
@@ -62,11 +63,17 @@ for name, cls in _CONFIG_REGISTRY.items():
         AutoConfig.register(name, cls)
 
 
-def download_from_hf(model_path: str):
+def download_from_hf(
+    model_path: str,
+    allow_patterns: Optional[Union[str, list]] = None,
+):
     if os.path.exists(model_path):
         return model_path
 
-    return snapshot_download(model_path, allow_patterns=["*.json", "*.bin", "*.model"])
+    if not allow_patterns:
+        allow_patterns = ["*.json", "*.bin", "*.model"]
+
+    return snapshot_download(model_path, allow_patterns=allow_patterns)
 
 
 def get_hf_text_config(config: PretrainedConfig):
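
download_from_hf keeps its old defaults ("*.json", "*.bin", "*.model") but now lets callers narrow the download. A hedged usage sketch (the model ID is illustrative and not part of the diff):

from sglang.srt.hf_transformers_utils import download_from_hf

# Default behaviour, unchanged: JSON/bin/model files are mirrored locally.
local_dir = download_from_hf("Qwen/Qwen2.5-7B-Instruct")

# New: fetch only JSON config files, as get_sparse_attention_config below does.
config_dir = download_from_hf("Qwen/Qwen2.5-7B-Instruct", allow_patterns=["*.json"])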
@@ -171,6 +178,26 @@ def get_generation_config(
     return None
 
 
+# Qwen-1M related
+def get_sparse_attention_config(
+    model: str,
+    sparse_attention_config_filename: str = "sparse_attention_config.json",
+) -> Dict[str, Any]:
+    is_local = os.path.isdir(model)
+    if not is_local:
+        # Download the config files.
+        model = download_from_hf(model, allow_patterns=["*.json"])
+
+    config_file = os.path.join(model, sparse_attention_config_filename)
+    if not os.path.exists(config_file):
+        return {}
+
+    # Load the sparse attention config.
+    with open(config_file) as f:
+        config = json.load(f)
+    return config
+
+
 # Models don't use the same configuration key for determining the maximum
 # context length. Store them here so we can sanely check them.
 # NOTE: The ordering here is important. Some models have two of these and we
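
The new helper is tolerant of models that ship no sparse-attention config: it returns an empty dict rather than raising, and for remote models it pulls only *.json files. A short usage sketch (the model ID is illustrative):

from sglang.srt.hf_transformers_utils import get_sparse_attention_config

sparse_cfg = get_sparse_attention_config("Qwen/Qwen2.5-14B-Instruct-1M")
if sparse_cfg:
    print("sparse attention config found:", sparse_cfg)
else:
    print("no sparse_attention_config.json for this model")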
sglang/srt/jinja_template_utils.py

@@ -9,6 +9,8 @@ import logging
 import jinja2
 import transformers.utils.chat_template_utils as hf_chat_utils
 
+from sglang.srt.utils import ImageData
+
 logger = logging.getLogger(__name__)
 
 # ============================================================================
@@ -100,6 +102,12 @@ def detect_jinja_template_content_format(chat_template: str) -> str:
             if _is_var_or_elems_access(loop_iter, "message", "content"):
                 return "openai"  # Found content iteration → openai format
 
+            # Also check for patterns like: {%- for item in msg.content -%} or {%- for item in m.content -%}
+            if _is_var_or_elems_access(
+                loop_iter, "msg", "content"
+            ) or _is_var_or_elems_access(loop_iter, "m", "content"):
+                return "openai"  # Found content iteration → openai format (glm4v)
+
         return "string"  # No content loops found → string format
     except Exception as e:
         logger.debug(f"Error when parsing AST of Jinja template: {e}")
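
The extra check classifies glm4v-style templates, which loop over msg.content or m.content instead of message.content, as the "openai" content format. A minimal sketch (the template fragment is made up; real chat templates are far longer):

from sglang.srt.jinja_template_utils import detect_jinja_template_content_format

template = """\
{%- for msg in messages -%}
  {%- for item in msg.content -%}
    {%- if item.type == 'text' -%}{{ item.text }}{%- endif -%}
  {%- endfor -%}
{%- endfor -%}
"""

print(detect_jinja_template_content_format(template))  # expected: "openai"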
@@ -140,7 +148,12 @@ def process_content_for_template_format(
             chunk_type = chunk.get("type")
 
             if chunk_type == "image_url":
-                image_data.append(chunk["image_url"]["url"])
+                image_data.append(
+                    ImageData(
+                        url=chunk["image_url"]["url"],
+                        detail=chunk["image_url"].get("detail", "auto"),
+                    )
+                )
                 if chunk.get("modalities"):
                     modalities.append(chunk.get("modalities"))
                 # Normalize to simple 'image' type for template compatibility
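
For context, image_url chunks are now carried as structured ImageData objects instead of bare URL strings, preserving the OpenAI "detail" hint (defaulting to "auto"). A hedged sketch of the mapping, assuming ImageData exposes its fields as attributes (the URL is made up):

from sglang.srt.utils import ImageData

chunk = {
    "type": "image_url",
    "image_url": {"url": "https://example.com/cat.png", "detail": "high"},
}

img = ImageData(
    url=chunk["image_url"]["url"],
    detail=chunk["image_url"].get("detail", "auto"),
)
print(img.url, img.detail)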