sglang 0.5.1.post1__py3-none-any.whl → 0.5.1.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. sglang/bench_one_batch_server.py +79 -53
  2. sglang/bench_serving.py +186 -14
  3. sglang/profiler.py +0 -1
  4. sglang/srt/conversation.py +38 -5
  5. sglang/srt/disaggregation/decode.py +4 -0
  6. sglang/srt/disaggregation/prefill.py +4 -0
  7. sglang/srt/entrypoints/engine.py +2 -2
  8. sglang/srt/entrypoints/openai/protocol.py +27 -24
  9. sglang/srt/entrypoints/openai/serving_chat.py +50 -9
  10. sglang/srt/entrypoints/openai/serving_completions.py +15 -0
  11. sglang/srt/entrypoints/tool.py +7 -7
  12. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  13. sglang/srt/function_call/function_call_parser.py +2 -0
  14. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  15. sglang/srt/harmony_parser.py +588 -0
  16. sglang/srt/hf_transformers_utils.py +16 -7
  17. sglang/srt/layers/attention/ascend_backend.py +218 -111
  18. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  19. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  20. sglang/srt/layers/attention/flashinfer_mla_backend.py +76 -91
  21. sglang/srt/layers/attention/utils.py +15 -94
  22. sglang/srt/layers/communicator.py +1 -2
  23. sglang/srt/layers/moe/cutlass_moe.py +0 -15
  24. sglang/srt/layers/moe/ep_moe/layer.py +1 -7
  25. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  27. sglang/srt/layers/moe/topk.py +1 -1
  28. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
  29. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -7
  30. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
  31. sglang/srt/layers/quantization/fp8.py +2 -1
  32. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  33. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  34. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  35. sglang/srt/layers/quantization/mxfp4.py +16 -23
  36. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  37. sglang/srt/layers/utils.py +0 -14
  38. sglang/srt/lora/lora_manager.py +29 -12
  39. sglang/srt/managers/cache_controller.py +223 -156
  40. sglang/srt/managers/detokenizer_manager.py +5 -0
  41. sglang/srt/managers/io_struct.py +30 -0
  42. sglang/srt/managers/scheduler.py +58 -7
  43. sglang/srt/managers/scheduler_metrics_mixin.py +15 -0
  44. sglang/srt/managers/tokenizer_manager.py +36 -3
  45. sglang/srt/mem_cache/hicache_storage.py +31 -20
  46. sglang/srt/mem_cache/hiradix_cache.py +12 -3
  47. sglang/srt/mem_cache/memory_pool.py +73 -14
  48. sglang/srt/mem_cache/memory_pool_host.py +3 -2
  49. sglang/srt/mem_cache/radix_cache.py +1 -0
  50. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +5 -13
  51. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +85 -81
  52. sglang/srt/metrics/collector.py +5 -5
  53. sglang/srt/model_executor/cuda_graph_runner.py +2 -2
  54. sglang/srt/model_executor/model_runner.py +1 -1
  55. sglang/srt/models/deepseek_v2.py +12 -3
  56. sglang/srt/models/gpt_oss.py +2 -1
  57. sglang/srt/models/qwen2_5_vl.py +1 -0
  58. sglang/srt/offloader.py +115 -0
  59. sglang/srt/reasoning_parser.py +56 -300
  60. sglang/srt/server_args.py +10 -5
  61. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  62. sglang/srt/utils.py +59 -12
  63. sglang/test/test_cutlass_moe.py +33 -28
  64. sglang/version.py +1 -1
  65. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/METADATA +6 -5
  66. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/RECORD +69 -65
  67. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/WHEEL +0 -0
  68. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/licenses/LICENSE +0 -0
  69. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,588 @@
1
+ import re
2
+ from dataclasses import dataclass
3
+ from typing import Iterator, List, Optional, Tuple
4
+
5
+
6
+ @dataclass
7
+ class Event:
8
+ """Represents a parsed event from the Harmony stream."""
9
+
10
+ event_type: str
11
+ content: str
12
+ raw_text: str = None # Original text including structural markers
13
+
14
+
15
+ @dataclass
16
+ class Token:
17
+ """A structural token in the Harmony format."""
18
+
19
+ type: str
20
+ start: int
21
+ end: int
22
+
23
+
24
+ def prefix_hold(text: str, tokens: List[str]) -> Tuple[str, str]:
25
+ """
26
+ Holds back the longest suffix of `text` that could be a prefix of any token.
27
+ Returns (emit_now, keep_for_later).
28
+ """
29
+ if not text:
30
+ return "", ""
31
+ max_hold = 0
32
+ for tok in tokens:
33
+ if not tok:
34
+ continue
35
+ # Check for prefixes of tok in the suffix of text
36
+ L = min(len(tok) - 1, len(text))
37
+ for k in range(L, 0, -1):
38
+ if tok.startswith(text[-k:]):
39
+ max_hold = max(max_hold, k)
40
+ break
41
+ if max_hold == 0:
42
+ return text, ""
43
+ return text[:-max_hold], text[-max_hold:]
44
+
45
+
46
+ def iter_tokens(text: str, start_pos: int = 0) -> Iterator[Token]:
47
+ """Iterate over structural tokens in left-to-right order."""
48
+ TOKENS = {
49
+ "<|start|>": "START",
50
+ "<|channel|>": "CHANNEL",
51
+ "<|message|>": "MESSAGE",
52
+ "<|constrain|>": "CONSTRAIN",
53
+ "<|end|>": "END",
54
+ "<|call|>": "CALL",
55
+ "<|return|>": "RETURN",
56
+ }
57
+
58
+ pos = start_pos
59
+ has_unknown_tokens = False
60
+ while pos < len(text):
61
+ # Find next "<|"
62
+ marker_pos = text.find("<|", pos)
63
+ if marker_pos == -1:
64
+ break
65
+
66
+ # Emit any text before the marker
67
+ if marker_pos > pos:
68
+ yield Token("TEXT", pos, marker_pos)
69
+
70
+ # Check which token it is
71
+ found_token = False
72
+
73
+ for literal, token_type in TOKENS.items():
74
+ if text.startswith(literal, marker_pos):
75
+ yield Token(token_type, marker_pos, marker_pos + len(literal))
76
+ pos = marker_pos + len(literal)
77
+ found_token = True
78
+ break
79
+ if not found_token:
80
+ tail = text[marker_pos:]
81
+ is_partial = any(lit.startswith(tail) for lit in TOKENS)
82
+ if is_partial:
83
+ # Hold whole tail (partial token)
84
+ yield Token("TEXT", marker_pos, len(text))
85
+ pos = len(text)
86
+ break
87
+ else:
88
+ # Unknown token like <|weird|> ...
89
+ has_unknown_tokens = True
90
+ # Emit the "<|" as a TEXT token first
91
+ yield Token("TEXT", marker_pos, marker_pos + 2)
92
+
93
+ # Try to find a closing "|>" for this unknown token
94
+ close_pos = text.find("|>", marker_pos + 2)
95
+ if close_pos != -1:
96
+ # Look ahead to the next structural token after the unknown close
97
+ next_marker = text.find("<|", close_pos + 2)
98
+ if next_marker != -1:
99
+ # Emit the unknown body + any following plain text up to next marker
100
+ yield Token("TEXT", marker_pos + 2, next_marker)
101
+ pos = next_marker
102
+ else:
103
+ # Emit until the end
104
+ yield Token("TEXT", marker_pos + 2, len(text))
105
+ pos = len(text)
106
+ break
107
+ else:
108
+ # No closing; advance past "<|" and continue scanning
109
+ pos = marker_pos + 2
110
+
111
+ # Emit any remaining text
112
+ if pos < len(text):
113
+ yield Token("TEXT", pos, len(text))
114
+ elif pos == len(text) and has_unknown_tokens:
115
+ # Add an empty trailing TEXT token only when we encountered unknown tokens
116
+ # and the text ends with a known structural token. This matches expected tests.
117
+ for literal in TOKENS.keys():
118
+ if text.endswith(literal):
119
+ yield Token("TEXT", pos, pos)
120
+ break
121
+
122
+
123
+ class CanonicalStrategy:
124
+ """Parses the canonical Harmony format with channel markers."""
125
+
126
+ def __init__(self):
127
+ self.guard_tokens = [
128
+ "<|start|>",
129
+ "<|channel|>",
130
+ "<|message|>",
131
+ "<|constrain|>",
132
+ "<|end|>",
133
+ "<|call|>",
134
+ "<|return|>",
135
+ ]
136
+
137
+ def parse(self, text: str) -> Tuple[List[Event], str]:
138
+ events = []
139
+ tokens = list(iter_tokens(text))
140
+
141
+ if not tokens:
142
+ return events, ""
143
+
144
+ pos = 0
145
+ while pos < len(tokens):
146
+ token = tokens[pos]
147
+
148
+ if token.type == "TEXT":
149
+ # Check if this might be incomplete
150
+ if pos == len(tokens) - 1: # Last token
151
+ emit, hold = prefix_hold(
152
+ text[token.start : token.end], self.guard_tokens
153
+ )
154
+ if emit:
155
+ events.append(Event("normal", emit))
156
+ return events, hold
157
+ else:
158
+ # Check if this might be commentary filler between blocks
159
+ if self._is_commentary_filler_between_blocks(text, tokens, pos):
160
+ # Skip this filler text - don't emit as normal content
161
+ pos += 1
162
+ else:
163
+ content = text[token.start : token.end]
164
+ # Skip standalone structural tokens that shouldn't be emitted as normal text
165
+ if not self._is_standalone_structural_token(content):
166
+ events.append(Event("normal", content))
167
+ pos += 1
168
+
169
+ elif token.type in ("START", "CHANNEL"):
170
+ # Parse a channel block starting here
171
+ block_result = self._parse_block(text, tokens, pos)
172
+ if block_result is None:
173
+ # Incomplete block - check if we can emit partial reasoning content
174
+ partial_result = self._parse_partial_analysis(text, tokens, pos)
175
+ if partial_result:
176
+ event, remaining_text = partial_result
177
+ events.append(event)
178
+ return events, remaining_text
179
+ # No partial content, hold entire remaining text
180
+ remaining_start = tokens[pos].start
181
+ return events, text[remaining_start:]
182
+ event, new_pos = block_result
183
+ if event:
184
+ events.append(event)
185
+ pos = new_pos
186
+
187
+ else:
188
+ # Check if this might be commentary filler between blocks
189
+ if self._is_commentary_filler_between_blocks(text, tokens, pos):
190
+ # Skip this filler text - don't emit as normal content
191
+ pos += 1
192
+ else:
193
+ # Unexpected token - only emit as text if it's not a standalone structural token
194
+ content = text[token.start : token.end]
195
+ if not self._is_standalone_structural_token(content):
196
+ events.append(Event("normal", content))
197
+ pos += 1
198
+
199
+ return events, ""
200
+
201
+ def _parse_partial_analysis(
202
+ self, text: str, tokens: List[Token], start_pos: int
203
+ ) -> Optional[Tuple[Event, str]]:
204
+ """Try to parse partial analysis content for incremental streaming."""
205
+ pos = start_pos
206
+
207
+ # Skip <|start|> if present
208
+ if pos < len(tokens) and tokens[pos].type == "START":
209
+ pos += 1
210
+
211
+ # Look for <|channel|> followed by analysis
212
+ channel_pos = None
213
+ message_pos = None
214
+
215
+ for i in range(pos, len(tokens)):
216
+ if tokens[i].type == "CHANNEL" and channel_pos is None:
217
+ channel_pos = i
218
+ elif tokens[i].type == "MESSAGE":
219
+ message_pos = i
220
+ break
221
+
222
+ if channel_pos is None or message_pos is None:
223
+ return None
224
+
225
+ # Extract channel type
226
+ channel_start = (
227
+ tokens[channel_pos + 1].start
228
+ if channel_pos + 1 < len(tokens)
229
+ else tokens[channel_pos].end
230
+ )
231
+ channel_end = tokens[message_pos].start
232
+ channel_header = text[channel_start:channel_end]
233
+
234
+ channel_type = self._extract_channel_type(channel_header)
235
+ if channel_type != "analysis":
236
+ return None # Only stream analysis content - tool calls wait for completion
237
+
238
+ # Extract partial content after <|message|>
239
+ content_start = tokens[message_pos].end
240
+ content = text[content_start:]
241
+
242
+ # Return partial reasoning content and preserve the channel structure for next parse
243
+ remaining_text = text[tokens[start_pos].start : content_start]
244
+ return Event("reasoning", content), remaining_text
245
+
246
+ def _extract_channel_type(self, header_text: str) -> Optional[str]:
247
+ """Extract channel type from header, ignoring other attributes like to=... or <|constrain|>..."""
248
+ # Look for channel type at the start of the header (case insensitive)
249
+ header_clean = header_text.strip()
250
+
251
+ if header_clean.lower().startswith("analysis"):
252
+ return "analysis"
253
+ elif header_clean.lower().startswith("commentary"):
254
+ return "commentary"
255
+ elif header_clean.lower().startswith("final"):
256
+ return "final"
257
+ else:
258
+ return None # Unknown channel type
259
+
260
+ def _parse_block(
261
+ self, text: str, tokens: List[Token], start_pos: int
262
+ ) -> Optional[Tuple[Optional[Event], int]]:
263
+ """Parse a channel block. Returns (event, next_pos) or None if incomplete."""
264
+ pos = start_pos
265
+
266
+ # Skip <|start|> if present
267
+ if pos < len(tokens) and tokens[pos].type == "START":
268
+ pos += 1
269
+
270
+ # Look for <|channel|> or <|message|> (tool responses go direct to message)
271
+ channel_pos = None
272
+ message_pos = None
273
+
274
+ for i in range(pos, len(tokens)):
275
+ if tokens[i].type == "CHANNEL" and channel_pos is None:
276
+ channel_pos = i
277
+ elif tokens[i].type == "MESSAGE":
278
+ message_pos = i
279
+ break
280
+
281
+ if message_pos is None:
282
+ return None # No message token found
283
+
284
+ # If no channel found, this is a tool response - treat as normal text
285
+ if channel_pos is None:
286
+ content_start = tokens[message_pos].end
287
+ # Find end token after message
288
+ end_token_pos = None
289
+ for i in range(message_pos + 1, len(tokens)):
290
+ if tokens[i].type in ("END", "CALL", "RETURN"):
291
+ end_token_pos = i
292
+ break
293
+ if end_token_pos is None:
294
+ return None # Incomplete
295
+ content = text[content_start : tokens[end_token_pos].start]
296
+ return Event("normal", content), end_token_pos + 1
297
+
298
+ # Standard channel block processing - message_pos is already found above
299
+ pos = channel_pos + 1 # Skip CHANNEL token
300
+
301
+ # Extract channel type from header (ignoring other attributes like to=... or <|constrain|>...)
302
+ channel_start = tokens[pos].start if pos < len(tokens) else tokens[pos - 1].end
303
+ channel_end = tokens[message_pos].start
304
+ channel_header = text[channel_start:channel_end]
305
+
306
+ channel_type = self._extract_channel_type(channel_header)
307
+ if not channel_type:
308
+ return None # Unknown or malformed channel
309
+
310
+ pos = message_pos + 1 # Skip MESSAGE token
311
+
312
+ # Find content and end token
313
+ content_start = tokens[message_pos].end
314
+ end_pos = pos
315
+
316
+ # Each channel type has specific valid end tokens
317
+ if channel_type == "final":
318
+ while end_pos < len(tokens) and tokens[end_pos].type != "RETURN":
319
+ end_pos += 1
320
+ elif channel_type == "analysis":
321
+ while end_pos < len(tokens) and tokens[end_pos].type not in ("END", "CALL"):
322
+ end_pos += 1
323
+ else: # commentary
324
+ while end_pos < len(tokens) and tokens[end_pos].type not in ("END", "CALL"):
325
+ end_pos += 1
326
+
327
+ if end_pos >= len(tokens):
328
+ # No end token found
329
+ if channel_type == "final":
330
+ # Final blocks can end at end of input without requiring <|return|>
331
+ content = text[content_start:]
332
+ return Event("normal", content), end_pos
333
+ return None # Analysis and commentary need proper end tokens
334
+
335
+ end_token = tokens[end_pos]
336
+ content = text[content_start : end_token.start]
337
+
338
+ # Create event based on channel and end token
339
+ if channel_type == "analysis":
340
+ if end_token.type == "CALL":
341
+ # Built-in tools (browser, python) use analysis channel with <|call|>
342
+ raw_text = text[tokens[start_pos].start : end_token.end]
343
+ return Event("tool_call", content.strip(), raw_text), end_pos + 1
344
+ else:
345
+ return Event("reasoning", content), end_pos + 1
346
+ elif channel_type == "commentary":
347
+ if end_token.type == "CALL":
348
+ raw_text = text[tokens[start_pos].start : end_token.end]
349
+ return Event("tool_call", content.strip(), raw_text), end_pos + 1
350
+ else:
351
+ return Event("normal", content), end_pos + 1
352
+ elif channel_type == "final":
353
+ # For final blocks, include any trailing TEXT immediately after <|return|>
354
+ final_content = content
355
+ if end_token.type == "RETURN" and end_pos + 1 < len(tokens):
356
+ next_token = tokens[end_pos + 1]
357
+ if next_token.type == "TEXT":
358
+ final_content += text[next_token.start : next_token.end]
359
+ return Event("normal", final_content), end_pos + 2
360
+ return Event("normal", final_content), end_pos + 1
361
+
362
+ return None, end_pos + 1
363
+
364
+ def _is_commentary_filler_between_blocks(
365
+ self, text: str, tokens: List[Token], pos: int
366
+ ) -> bool:
367
+ """Check if this is commentary filler text or problematic structural tokens in malformed sequences."""
368
+ current_token = tokens[pos]
369
+ current_text = text[current_token.start : current_token.end].strip()
370
+
371
+ # Check for commentary filler between CALL and CHANNEL
372
+ if pos > 0 and pos + 1 < len(tokens):
373
+ prev_token = tokens[pos - 1]
374
+ next_token = tokens[pos + 1]
375
+
376
+ # Check if we have CALL -> TEXT("commentary") -> CHANNEL pattern
377
+ if (
378
+ prev_token.type == "CALL"
379
+ and next_token.type == "CHANNEL"
380
+ and current_text.lower() == "commentary"
381
+ ):
382
+ return True
383
+
384
+ # Check for problematic patterns after CALL tokens (malformed sequences)
385
+ if pos > 0:
386
+ prev_token = tokens[pos - 1]
387
+
388
+ # Only filter structural tokens that appear immediately after CALL in malformed sequences
389
+ # These patterns indicate the content is malformed and the structural tokens are noise
390
+ if prev_token.type == "CALL":
391
+ # Filter MESSAGE tokens after CALL (should not happen in well-formed content)
392
+ if current_token.type == "MESSAGE":
393
+ return True
394
+
395
+ # Filter standalone "commentary" text after CALL
396
+ if (
397
+ current_token.type == "TEXT"
398
+ and current_text.lower() == "commentary"
399
+ ):
400
+ return True
401
+
402
+ return False
403
+
404
+ def _is_standalone_structural_token(self, content: str) -> bool:
405
+ """Check if content is just a standalone structural token that should be filtered."""
406
+ content_stripped = content.strip()
407
+ structural_tokens = [
408
+ "<|start|>",
409
+ "<|channel|>",
410
+ "<|message|>",
411
+ "<|constrain|>",
412
+ "<|end|>",
413
+ "<|call|>",
414
+ "<|return|>",
415
+ ]
416
+ return content_stripped in structural_tokens
417
+
418
+
419
+ class TextStrategy:
420
+ """Parses the text-based Harmony fallback format."""
421
+
422
+ def __init__(self):
423
+ self.buffer_context = ""
424
+ self.patterns = {
425
+ "analysis_then_final": re.compile(
426
+ r"^\s*(?:assistant)?\s*(analysis|commentary)(.*?)\s*assistantfinal\s*(.*)\s*$",
427
+ re.IGNORECASE | re.DOTALL,
428
+ ),
429
+ "final_only": re.compile(
430
+ r"^\s*assistantfinal\s*(.*)\s*$", re.IGNORECASE | re.DOTALL
431
+ ),
432
+ "analysis_only": re.compile(
433
+ r"^\s*(?:assistant)?\s*(analysis|commentary)(.*)\s*$",
434
+ re.IGNORECASE | re.DOTALL,
435
+ ),
436
+ }
437
+
438
+ def set_buffer_context(self, buffer: str):
439
+ self.buffer_context = buffer
440
+
441
+ def parse(self, text: str) -> Tuple[List[Event], str]:
442
+ events = []
443
+
444
+ m = self.patterns["analysis_then_final"].match(text)
445
+ if m:
446
+ channel, reasoning, final = m.groups()
447
+ if channel.lower() == "analysis" and reasoning.strip():
448
+ events.append(Event("reasoning", reasoning.strip()))
449
+ elif channel.lower() == "commentary" and reasoning.strip():
450
+ events.append(Event("normal", reasoning.strip()))
451
+ if final.strip():
452
+ events.append(Event("normal", final.strip()))
453
+ return events, ""
454
+
455
+ # If assistantfinal appears to be incomplete (e.g., 'assistantfin'), hold entire buffer
456
+ if re.search(
457
+ r"(?:^|\s)(?:assistant)?\s*(analysis|commentary)", text, re.IGNORECASE
458
+ ):
459
+ low = text.lower()
460
+ if "assistantfin" in low and "assistantfinal" not in low:
461
+ return events, text
462
+
463
+ m = self.patterns["final_only"].match(text)
464
+ if m:
465
+ final = m.group(1)
466
+ if final.strip():
467
+ events.append(Event("normal", final.strip()))
468
+ return events, ""
469
+
470
+ m = self.patterns["analysis_only"].match(text)
471
+ if m:
472
+ channel, content = m.groups()
473
+ emit, hold = prefix_hold(content, ["assistantfinal"])
474
+ if channel.lower() == "analysis" and emit:
475
+ # Stream reasoning content as-is based on structural markers only.
476
+ events.append(Event("reasoning", emit))
477
+ # Keep the channel header in the remaining buffer to continue parsing
478
+ # subsequent chunks in the text fallback format. Preserve any held
479
+ # prefix that may complete into "assistantfinal".
480
+ if hold:
481
+ return events, text[: m.start(2)] + hold
482
+ else:
483
+ return events, channel
484
+ elif channel.lower() == "commentary" and emit:
485
+ # For commentary, stream as normal text. Preserve spaces unless holding.
486
+ content_out = emit if hold else emit.strip()
487
+ events.append(Event("normal", content_out))
488
+ if hold:
489
+ return events, text[: m.start(2)] + hold
490
+ else:
491
+ return events, ""
492
+ # If no emit, just return the held content
493
+ return events, text[: m.start(2)] + hold
494
+
495
+ emit, hold = prefix_hold(text, ["analysis", "commentary", "assistantfinal"])
496
+ if emit:
497
+ events.append(Event("normal", emit))
498
+ return events, hold
499
+
500
+
501
+ class HarmonyParser:
502
+ """Facade for parsing Harmony format, switching between strategies."""
503
+
504
+ def __init__(self):
505
+ self.strategy = None
506
+ self._buffer = ""
507
+ self._should_filter_commentary = (
508
+ False # Track if we should filter commentary in next chunks
509
+ )
510
+ self._partial_commentary = (
511
+ "" # Track partial commentary being built across chunks
512
+ )
513
+
514
+ def parse(self, chunk: str) -> List[Event]:
515
+ self._buffer += chunk
516
+
517
+ if self.strategy is None:
518
+ if "<|channel|>" in self._buffer or "<|start|>" in self._buffer:
519
+ self.strategy = CanonicalStrategy()
520
+ elif re.search(
521
+ r"(?:^|\s)(?:assistant)?\s*(analysis|commentary|assistantfinal)",
522
+ self._buffer,
523
+ re.IGNORECASE,
524
+ ):
525
+ self.strategy = TextStrategy()
526
+ else:
527
+ # Not yet determined, hold
528
+ return []
529
+
530
+ if hasattr(self.strategy, "set_buffer_context"):
531
+ # Provide full buffer context to strategy for smarter whitespace handling
532
+ self.strategy.set_buffer_context(self._buffer)
533
+
534
+ events, remaining = self.strategy.parse(self._buffer)
535
+
536
+ # Check if we should start filtering commentary (after <|call|> token or tool_call event)
537
+ buffer_has_call_token = self._buffer.rstrip().endswith("<|call|>")
538
+
539
+ self._buffer = remaining
540
+
541
+ # Filter events for streaming case
542
+ filtered_events = []
543
+ for event in events:
544
+ should_filter = False
545
+
546
+ if event.event_type == "normal":
547
+ # Check if we're in a commentary filtering state
548
+ if self._should_filter_commentary or self._partial_commentary:
549
+ # Try to build partial commentary
550
+ potential_commentary = (
551
+ self._partial_commentary + event.content.strip().lower()
552
+ )
553
+
554
+ if potential_commentary == "commentary":
555
+ # Complete commentary found - filter it
556
+ should_filter = True
557
+ self._partial_commentary = "" # Reset
558
+ self._should_filter_commentary = False # Done filtering
559
+ elif "commentary".startswith(potential_commentary):
560
+ # Partial match - accumulate and filter this chunk
561
+ should_filter = True
562
+ self._partial_commentary = potential_commentary
563
+ else:
564
+ # Not commentary - reset and keep the event
565
+ self._partial_commentary = ""
566
+ self._should_filter_commentary = False
567
+ else:
568
+ # Not in commentary filtering state - reset partial state
569
+ self._partial_commentary = ""
570
+
571
+ if should_filter:
572
+ # Skip this commentary filler
573
+ continue
574
+
575
+ # Update filtering state based on events and buffer state
576
+ if event.event_type == "tool_call":
577
+ self._should_filter_commentary = (
578
+ True # Filter commentary after tool calls
579
+ )
580
+ self._partial_commentary = "" # Reset on tool call
581
+ elif buffer_has_call_token:
582
+ self._should_filter_commentary = (
583
+ True # Filter commentary after <|call|> token
584
+ )
585
+
586
+ filtered_events.append(event)
587
+
588
+ return filtered_events
@@ -368,13 +368,22 @@ def get_processor(
368
368
  if config.model_type not in {"llava", "clip"}:
369
369
  kwargs["use_fast"] = use_fast
370
370
  try:
371
- processor = AutoProcessor.from_pretrained(
372
- tokenizer_name,
373
- *args,
374
- trust_remote_code=trust_remote_code,
375
- revision=revision,
376
- **kwargs,
377
- )
371
+ if "InternVL3_5" in tokenizer_name:
372
+ processor = AutoTokenizer.from_pretrained(
373
+ tokenizer_name,
374
+ *args,
375
+ trust_remote_code=trust_remote_code,
376
+ revision=revision,
377
+ **kwargs,
378
+ )
379
+ else:
380
+ processor = AutoProcessor.from_pretrained(
381
+ tokenizer_name,
382
+ *args,
383
+ trust_remote_code=trust_remote_code,
384
+ revision=revision,
385
+ **kwargs,
386
+ )
378
387
 
379
388
  except ValueError as e:
380
389
  error_message = str(e)