sglang-0.4.6.post3-py3-none-any.whl → sglang-0.4.6.post4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. sglang/bench_offline_throughput.py +4 -2
  2. sglang/bench_one_batch.py +2 -2
  3. sglang/bench_one_batch_server.py +143 -15
  4. sglang/bench_serving.py +9 -7
  5. sglang/compile_deep_gemm.py +1 -1
  6. sglang/eval/loogle_eval.py +157 -0
  7. sglang/lang/chat_template.py +78 -78
  8. sglang/lang/tracer.py +1 -1
  9. sglang/srt/code_completion_parser.py +1 -1
  10. sglang/srt/configs/deepseekvl2.py +2 -2
  11. sglang/srt/configs/model_config.py +1 -0
  12. sglang/srt/constrained/base_grammar_backend.py +55 -72
  13. sglang/srt/constrained/llguidance_backend.py +25 -21
  14. sglang/srt/constrained/outlines_backend.py +27 -26
  15. sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
  16. sglang/srt/constrained/xgrammar_backend.py +69 -43
  17. sglang/srt/conversation.py +48 -43
  18. sglang/srt/disaggregation/base/conn.py +1 -0
  19. sglang/srt/disaggregation/decode.py +7 -2
  20. sglang/srt/disaggregation/fake/conn.py +1 -1
  21. sglang/srt/disaggregation/mooncake/conn.py +227 -120
  22. sglang/srt/disaggregation/nixl/conn.py +1 -0
  23. sglang/srt/disaggregation/prefill.py +7 -4
  24. sglang/srt/disaggregation/utils.py +7 -1
  25. sglang/srt/entrypoints/engine.py +17 -2
  26. sglang/srt/entrypoints/http_server.py +17 -2
  27. sglang/srt/function_call_parser.py +2 -2
  28. sglang/srt/layers/attention/flashattention_backend.py +1 -1
  29. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
  30. sglang/srt/layers/attention/utils.py +4 -2
  31. sglang/srt/layers/dp_attention.py +71 -21
  32. sglang/srt/layers/layernorm.py +1 -1
  33. sglang/srt/layers/logits_processor.py +46 -11
  34. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  35. sglang/srt/layers/moe/ep_moe/layer.py +1 -1
  36. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
  37. sglang/srt/layers/moe/topk.py +1 -1
  38. sglang/srt/layers/quantization/__init__.py +1 -1
  39. sglang/srt/layers/quantization/blockwise_int8.py +2 -2
  40. sglang/srt/layers/quantization/deep_gemm.py +72 -71
  41. sglang/srt/layers/quantization/fp8.py +2 -2
  42. sglang/srt/layers/quantization/fp8_kernel.py +3 -3
  43. sglang/srt/layers/quantization/int8_kernel.py +2 -2
  44. sglang/srt/layers/sampler.py +0 -4
  45. sglang/srt/layers/vocab_parallel_embedding.py +18 -7
  46. sglang/srt/lora/lora_manager.py +1 -1
  47. sglang/srt/lora/mem_pool.py +4 -4
  48. sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
  49. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  50. sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
  51. sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
  52. sglang/srt/lora/utils.py +1 -1
  53. sglang/srt/managers/data_parallel_controller.py +3 -3
  54. sglang/srt/managers/detokenizer_manager.py +21 -8
  55. sglang/srt/managers/io_struct.py +3 -1
  56. sglang/srt/managers/mm_utils.py +1 -1
  57. sglang/srt/managers/multimodal_processors/llava.py +46 -0
  58. sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
  59. sglang/srt/managers/schedule_batch.py +76 -24
  60. sglang/srt/managers/schedule_policy.py +0 -3
  61. sglang/srt/managers/scheduler.py +113 -88
  62. sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
  63. sglang/srt/managers/tokenizer_manager.py +133 -34
  64. sglang/srt/managers/tp_worker.py +12 -9
  65. sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
  66. sglang/srt/mem_cache/memory_pool.py +2 -0
  67. sglang/srt/metrics/collector.py +312 -37
  68. sglang/srt/model_executor/cuda_graph_runner.py +10 -11
  69. sglang/srt/model_executor/forward_batch_info.py +1 -1
  70. sglang/srt/model_executor/model_runner.py +19 -14
  71. sglang/srt/models/deepseek_janus_pro.py +2 -2
  72. sglang/srt/models/deepseek_v2.py +23 -20
  73. sglang/srt/models/llama.py +2 -0
  74. sglang/srt/models/llama4.py +5 -6
  75. sglang/srt/models/llava.py +248 -5
  76. sglang/srt/models/mixtral.py +98 -34
  77. sglang/srt/models/pixtral.py +467 -0
  78. sglang/srt/models/roberta.py +1 -1
  79. sglang/srt/models/torch_native_llama.py +1 -1
  80. sglang/srt/openai_api/adapter.py +30 -4
  81. sglang/srt/openai_api/protocol.py +0 -8
  82. sglang/srt/reasoning_parser.py +3 -3
  83. sglang/srt/sampling/custom_logit_processor.py +18 -3
  84. sglang/srt/sampling/sampling_batch_info.py +4 -56
  85. sglang/srt/sampling/sampling_params.py +2 -2
  86. sglang/srt/server_args.py +34 -4
  87. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  88. sglang/srt/speculative/eagle_utils.py +7 -7
  89. sglang/srt/speculative/eagle_worker.py +22 -19
  90. sglang/srt/utils.py +6 -5
  91. sglang/test/few_shot_gsm8k.py +2 -2
  92. sglang/test/few_shot_gsm8k_engine.py +2 -2
  93. sglang/test/run_eval.py +2 -2
  94. sglang/test/runners.py +8 -1
  95. sglang/test/send_one.py +13 -3
  96. sglang/test/simple_eval_common.py +1 -1
  97. sglang/test/simple_eval_humaneval.py +1 -1
  98. sglang/test/test_programs.py +5 -5
  99. sglang/test/test_utils.py +89 -14
  100. sglang/utils.py +1 -1
  101. sglang/version.py +1 -1
  102. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +6 -5
  103. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +107 -104
  104. /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
  105. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +0 -0
  106. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
  107. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
@@ -13,7 +13,6 @@
13
13
  # ==============================================================================
14
14
  """The baseclass of a backend for reasoner grammar-guided constrained decoding."""
15
15
 
16
- from concurrent.futures import Future
17
16
  from typing import List, Optional, Tuple
18
17
 
19
18
  import torch
@@ -28,13 +27,12 @@ class ReasonerGrammarObject(BaseGrammarObject):
28
27
  self.think_end_id = think_end_id
29
28
  self.is_in_reasoning = True
30
29
 
31
- @property
32
- def finished(self):
33
- return self.grammar.finished
30
+ def accept_token(self, token: int):
31
+ if token == self.think_end_id:
32
+ self.is_in_reasoning = False
34
33
 
35
- @finished.setter
36
- def finished(self, finished):
37
- self.grammar.finished = finished
34
+ if not self.is_in_reasoning and token != self.think_end_id:
35
+ self.grammar.accept_token(token)
38
36
 
39
37
  def allocate_vocab_mask(
40
38
  self, vocab_size: int, batch_size: int, device
@@ -52,12 +50,16 @@ class ReasonerGrammarObject(BaseGrammarObject):
52
50
  def apply_vocab_mask(self):
53
51
  return self.grammar.apply_vocab_mask
54
52
 
55
- def accept_token(self, token: int):
56
- if token == self.think_end_id:
57
- self.is_in_reasoning = False
53
+ def copy(self) -> BaseGrammarObject:
54
+ return ReasonerGrammarObject(self.grammar.copy(), self.think_end_id)
58
55
 
59
- if not self.is_in_reasoning and token != self.think_end_id:
60
- self.grammar.accept_token(token)
56
+ @property
57
+ def finished(self):
58
+ return self.grammar.finished
59
+
60
+ @finished.setter
61
+ def finished(self, finished):
62
+ self.grammar.finished = finished
61
63
 
62
64
  def try_jump_forward(self, tokenizer):
63
65
  return self.grammar.try_jump_forward(tokenizer)
@@ -72,30 +74,17 @@ class ReasonerGrammarObject(BaseGrammarObject):
72
74
  old_output_ids, new_output_ids, next_state
73
75
  )
74
76
 
75
- def copy(self) -> BaseGrammarObject:
76
- return ReasonerGrammarObject(self.grammar.copy(), self.think_end_id)
77
-
78
77
 
79
78
  class ReasonerGrammarBackend(BaseGrammarBackend):
80
79
  def __init__(self, grammar_backend: BaseGrammarBackend, think_end_id):
80
+ super().__init__()
81
81
  self.grammar_backend = grammar_backend
82
82
  self.think_end_id = think_end_id
83
83
 
84
- def get_cached_value(self, key: Tuple[str, str]) -> Optional[ReasonerGrammarObject]:
85
- grammar = self.grammar_backend.get_cached_value(key)
86
- return ReasonerGrammarObject(grammar, self.think_end_id) if grammar else None
87
-
88
- def get_future_value(self, key: Tuple[str, str]) -> Future:
89
- grammar = Future()
90
-
91
- def callback(f: Future):
92
- if result := f.result():
93
- grammar.set_result(ReasonerGrammarObject(result, self.think_end_id))
94
- else:
95
- grammar.set_result(None)
96
-
97
- self.grammar_backend.get_future_value(key).add_done_callback(callback)
98
- return grammar
99
-
100
- def reset(self):
101
- self.grammar_backend.reset()
84
+ def _init_value_dispatch(
85
+ self, key: Tuple[str, str]
86
+ ) -> Optional[ReasonerGrammarObject]:
87
+ ret = self.grammar_backend._init_value_dispatch(key)
88
+ if ret is None:
89
+ return None
90
+ return ReasonerGrammarObject(ret, self.think_end_id)
@@ -18,7 +18,6 @@ import logging
18
18
  from typing import List, Optional, Tuple, Union
19
19
 
20
20
  import torch
21
- import xgrammar
22
21
  from xgrammar import (
23
22
  CompiledGrammar,
24
23
  GrammarCompiler,
@@ -35,7 +34,6 @@ from sglang.srt.constrained.base_grammar_backend import (
35
34
  from sglang.srt.constrained.triton_ops.bitmask_ops import (
36
35
  apply_token_bitmask_inplace_triton,
37
36
  )
38
- from sglang.srt.utils import get_bool_env_var
39
37
 
40
38
  logger = logging.getLogger(__name__)
41
39
 
@@ -51,49 +49,35 @@ class XGrammarGrammar(BaseGrammarObject):
51
49
  vocab_size: int,
52
50
  ctx: CompiledGrammar,
53
51
  override_stop_tokens: Optional[Union[List[int], int]],
52
+ key_string: Optional[str] = None, # TODO (sk): for debugging, remove later
54
53
  ) -> None:
55
- super().__init__()
56
54
  self.matcher = matcher
57
55
  self.vocab_size = vocab_size
58
56
  self.ctx = ctx
59
57
  self.override_stop_tokens = override_stop_tokens
60
58
  self.finished = False
61
-
62
- from xgrammar.kernels.apply_token_bitmask_inplace_cpu import (
63
- apply_token_bitmask_inplace_cpu,
64
- )
65
-
66
- self.apply_vocab_mask_cpu = apply_token_bitmask_inplace_cpu
59
+ self.accepted_tokens = []
60
+ self.key_string = key_string
67
61
 
68
62
  def accept_token(self, token: int):
69
- assert self.matcher.accept_token(token)
70
-
71
- def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
72
- s = self.matcher.find_jump_forward_string()
73
- if s:
74
- return [], s
75
- return None
76
-
77
- def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
78
- _, data = helper
79
- return data, -1
80
-
81
- def jump_and_retokenize(
82
- self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
83
- ):
84
- k = 0
85
- for i, old_id in enumerate(old_output_ids):
86
- if old_id == new_output_ids[i]:
87
- k = i + 1
63
+ if not self.is_terminated():
64
+ accepted = self.matcher.accept_token(token)
65
+ if not accepted:
66
+ # log for debugging
67
+ raise ValueError(
68
+ f"Tokens not accepted: {token}\n"
69
+ f"Accepted tokens: {self.accepted_tokens}\n"
70
+ f"Key string: {self.key_string}"
71
+ )
88
72
  else:
89
- break
73
+ self.accepted_tokens.append(token)
90
74
 
91
- # rollback to the last token that is the same
92
- if k < len(old_output_ids):
93
- self.matcher.rollback(len(old_output_ids) - k)
75
+ def rollback(self, k: int):
76
+ self.matcher.rollback(k)
77
+ self.accepted_tokens = self.accepted_tokens[:-k]
94
78
 
95
- for i in range(k, len(new_output_ids)):
96
- assert self.matcher.accept_token(new_output_ids[i])
79
+ def is_terminated(self):
80
+ return self.matcher.is_terminated()
97
81
 
98
82
  def allocate_vocab_mask(
99
83
  self, vocab_size: int, batch_size: int, device
@@ -122,9 +106,43 @@ class XGrammarGrammar(BaseGrammarObject):
122
106
  override_stop_tokens=self.override_stop_tokens,
123
107
  )
124
108
  return XGrammarGrammar(
125
- matcher, self.vocab_size, self.ctx, self.override_stop_tokens
109
+ matcher,
110
+ self.vocab_size,
111
+ self.ctx,
112
+ self.override_stop_tokens,
113
+ self.key_string,
126
114
  )
127
115
 
116
+ def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
117
+ s = self.matcher.find_jump_forward_string()
118
+ if s:
119
+ return [], s
120
+ return None
121
+
122
+ def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
123
+ _, data = helper
124
+ return data, -1
125
+
126
+ def jump_and_retokenize(
127
+ self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
128
+ ):
129
+ k = 0
130
+ for i, old_id in enumerate(old_output_ids):
131
+ if old_id == new_output_ids[i]:
132
+ k = i + 1
133
+ else:
134
+ break
135
+
136
+ # rollback to the last token that is the same
137
+ if k < len(old_output_ids):
138
+ self.matcher.rollback(len(old_output_ids) - k)
139
+
140
+ for i in range(k, len(new_output_ids)):
141
+ assert self.matcher.accept_token(new_output_ids[i])
142
+
143
+ def __repr__(self):
144
+ return f"XGrammarGrammar({self.key_string=}, {self.accepted_tokens=})"
145
+
128
146
 
129
147
  class XGrammarGrammarBackend(BaseGrammarBackend):
130
148
  def __init__(
@@ -143,9 +161,15 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
143
161
  self.vocab_size = vocab_size
144
162
  self.override_stop_tokens = override_stop_tokens
145
163
 
146
- def _from_context(self, ctx: CompiledGrammar) -> XGrammarGrammar:
147
- matcher = GrammarMatcher(ctx, max_rollback_tokens=MAX_ROLLBACK_TOKENS)
148
- return XGrammarGrammar(matcher, self.vocab_size, ctx, self.override_stop_tokens)
164
+ def _from_context(self, ctx: CompiledGrammar, key_string: str) -> XGrammarGrammar:
165
+ matcher = GrammarMatcher(
166
+ ctx,
167
+ max_rollback_tokens=MAX_ROLLBACK_TOKENS,
168
+ override_stop_tokens=self.override_stop_tokens,
169
+ )
170
+ return XGrammarGrammar(
171
+ matcher, self.vocab_size, ctx, self.override_stop_tokens, key_string
172
+ )
149
173
 
150
174
  def dispatch_json(self, key_string: str) -> Optional[XGrammarGrammar]:
151
175
  try:
@@ -157,7 +181,7 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
157
181
  except RuntimeError as e:
158
182
  logging.warning(f"Skip invalid json_schema: json_schema={key_string}, {e=}")
159
183
  return None
160
- return self._from_context(ctx)
184
+ return self._from_context(ctx, key_string)
161
185
 
162
186
  def dispatch_ebnf(self, key_string: str) -> Optional[XGrammarGrammar]:
163
187
  try:
@@ -165,7 +189,7 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
165
189
  except RuntimeError as e:
166
190
  logging.warning(f"Skip invalid ebnf: ebnf={key_string}, {e=}")
167
191
  return None
168
- return self._from_context(ctx)
192
+ return self._from_context(ctx, key_string)
169
193
 
170
194
  def dispatch_regex(self, key_string: str) -> Optional[XGrammarGrammar]:
171
195
  try:
@@ -173,7 +197,7 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
173
197
  except RuntimeError as e:
174
198
  logging.warning(f"Skip invalid regex: regex={key_string}, {e=}")
175
199
  return None
176
- return self._from_context(ctx)
200
+ return self._from_context(ctx, key_string)
177
201
 
178
202
  def dispatch_structural_tag(self, key_string: str) -> Optional[XGrammarGrammar]:
179
203
  try:
@@ -190,9 +214,11 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
190
214
  tags, structural_tag["triggers"]
191
215
  )
192
216
  except RuntimeError as e:
193
- logging.warning(f"Skip invalid regex: regex={key_string}, {e=}")
217
+ logging.warning(
218
+ f"Skip invalid structural_tag: structural_tag={key_string}, {e=}"
219
+ )
194
220
  return None
195
- return self._from_context(ctx)
221
+ return self._from_context(ctx, key_string)
196
222
 
197
223
  def reset(self):
198
224
  if self.grammar_compiler:
@@ -16,6 +16,7 @@
16
16
  # Adapted from
17
17
  # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
18
18
  import dataclasses
19
+ import re
19
20
  from enum import IntEnum, auto
20
21
  from typing import Callable, Dict, List, Optional, Tuple, Union
21
22
 
@@ -633,6 +634,20 @@ register_conv_template(
633
634
  )
634
635
  )
635
636
 
637
+ # reference: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/blob/main/chat_template.json
638
+ register_conv_template(
639
+ Conversation(
640
+ name="mistral",
641
+ system_template="[SYSTEM_PROMPT]\n{system_message}\n[/SYSTEM_PROMPT]\n\n",
642
+ roles=("[INST]", "[/INST]"),
643
+ sep_style=SeparatorStyle.LLAMA2,
644
+ sep=" ",
645
+ sep2=" </s><s>",
646
+ stop_str=["[INST]", "[/INST]", "[SYSTEM_PROMPT]", "[/SYSTEM_PROMPT]"],
647
+ image_token="[IMG]",
648
+ )
649
+ )
650
+
636
651
  # reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
637
652
  register_conv_template(
638
653
  Conversation(
@@ -852,91 +867,81 @@ register_conv_template(
852
867
  )
853
868
 
854
869
 
870
+ @register_conv_template_matching_function
871
+ def match_internvl(model_path: str):
872
+ if re.search(r"internvl2_5", model_path, re.IGNORECASE):
873
+ return "internvl-2-5"
874
+
875
+
855
876
  @register_conv_template_matching_function
856
877
  def match_llama_3_vision(model_path: str):
857
- if (
858
- "llama" in model_path.lower()
859
- and "3.2" in model_path.lower()
860
- and "vision" in model_path.lower()
861
- ):
878
+ if re.search(r"llama.*3\.2.*vision", model_path, re.IGNORECASE):
862
879
  return "llama_3_vision"
863
880
 
864
881
 
865
882
  @register_conv_template_matching_function
866
883
  def match_deepseek_janus_pro(model_path: str):
867
- if "janus" in model_path.lower():
884
+ if re.search(r"janus", model_path, re.IGNORECASE):
868
885
  return "janus-pro"
869
886
 
870
887
 
871
888
  @register_conv_template_matching_function
872
889
  def match_vicuna(model_path: str):
873
- if "vicuna" in model_path.lower():
874
- return "vicuna_v1.1"
875
- if "llava-v1.5" in model_path.lower():
876
- return "vicuna_v1.1"
877
- if "llava-next-video-7b" in model_path.lower():
890
+ if re.search(r"vicuna|llava-v1\.5|llava-next-video-7b", model_path, re.IGNORECASE):
878
891
  return "vicuna_v1.1"
879
892
 
880
893
 
881
894
  @register_conv_template_matching_function
882
895
  def match_llama2_chat(model_path: str):
883
- model_path = model_path.lower()
884
- if "llama-2" in model_path and "chat" in model_path:
885
- return "llama-2"
886
- if (
887
- "mistral" in model_path or "mixtral" in model_path
888
- ) and "instruct" in model_path:
889
- return "llama-2"
890
- if "codellama" in model_path and "instruct" in model_path:
896
+ if re.search(
897
+ r"llama-2.*chat|codellama.*instruct",
898
+ model_path,
899
+ re.IGNORECASE,
900
+ ):
891
901
  return "llama-2"
892
902
 
893
903
 
904
+ @register_conv_template_matching_function
905
+ def match_mistral(model_path: str):
906
+ if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
907
+ return "mistral"
908
+
909
+
894
910
  @register_conv_template_matching_function
895
911
  def match_deepseek_vl(model_path: str):
896
- model_path = model_path.lower()
897
- if "deepseek" in model_path and "vl2" in model_path:
912
+ if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE):
898
913
  return "deepseek-vl2"
899
914
 
900
915
 
901
916
  @register_conv_template_matching_function
902
- def match_chat_ml(model_path: str):
903
- # import pdb;pdb.set_trace()
904
- model_path = model_path.lower()
905
- # Now the suffix for qwen2 chat model is "instruct"
906
- if "gme" in model_path and "qwen" in model_path and "vl" in model_path:
917
+ def match_qwen_chat_ml(model_path: str):
918
+ if re.search(r"gme.*qwen.*vl", model_path, re.IGNORECASE):
907
919
  return "gme-qwen2-vl"
908
- if "qwen" in model_path and "vl" in model_path:
920
+ if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
909
921
  return "qwen2-vl"
910
- if (
911
- "llava-v1.6-34b" in model_path
912
- or "llava-v1.6-yi-34b" in model_path
913
- or "llava-next-video-34b" in model_path
914
- or "llava-onevision-qwen2" in model_path
922
+ if re.search(
923
+ r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
924
+ model_path,
925
+ re.IGNORECASE,
915
926
  ):
916
927
  return "chatml-llava"
917
928
 
918
929
 
919
930
  @register_conv_template_matching_function
920
- def match_gemma_it(model_path: str):
921
- model_path = model_path.lower()
922
- if "gemma" in model_path and "it" in model_path:
923
- return "gemma-it"
924
- if "gemma-3" in model_path and "1b" not in model_path:
925
- # gemma-3-1b-it is completion model
931
+ def match_gemma3_instruct(model_path: str):
932
+ if re.search(r"gemma-3.*it", model_path, re.IGNORECASE):
926
933
  return "gemma-it"
927
934
 
928
935
 
929
936
  @register_conv_template_matching_function
930
937
  def match_openbmb_minicpm(model_path: str):
931
- model_path = model_path.lower()
932
- if "minicpm-v" in model_path:
938
+ if re.search(r"minicpm-v", model_path, re.IGNORECASE):
933
939
  return "minicpmv"
934
- elif "minicpm-o" in model_path:
940
+ elif re.search(r"minicpm-o", model_path, re.IGNORECASE):
935
941
  return "minicpmo"
936
942
 
937
943
 
938
944
  @register_conv_template_matching_function
939
945
  def match_moonshot_kimivl(model_path: str):
940
- model_path = model_path.lower()
941
- if "kimi" in model_path and "vl" in model_path:
946
+ if re.search(r"kimi.*vl", model_path, re.IGNORECASE):
942
947
  return "kimi-vl"
@@ -37,6 +37,7 @@ class BaseKVManager(ABC):
37
37
  args: KVArgs,
38
38
  disaggregation_mode: DisaggregationMode,
39
39
  server_args: ServerArgs,
40
+ is_mla_backend: Optional[bool] = False,
40
41
  ): ...
41
42
 
42
43
 
@@ -38,6 +38,7 @@ from sglang.srt.disaggregation.utils import (
38
38
  ReqToMetadataIdxAllocator,
39
39
  TransferBackend,
40
40
  get_kv_class,
41
+ is_mla_backend,
41
42
  kv_to_page_indices,
42
43
  poll_and_all_reduce,
43
44
  )
@@ -87,6 +88,7 @@ class DecodePreallocQueue:
87
88
  self.req_to_token_pool = req_to_token_pool
88
89
  self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
89
90
  self.token_to_kv_pool = token_to_kv_pool_allocator.get_kvcache()
91
+ self.is_mla_backend = is_mla_backend(self.token_to_kv_pool)
90
92
  self.aux_dtype = aux_dtype
91
93
  self.metadata_buffers = metadata_buffers
92
94
  self.req_to_metadata_buffer_idx_allocator = req_to_metadata_buffer_idx_allocator
@@ -131,7 +133,10 @@ class DecodePreallocQueue:
131
133
  kv_args.gpu_id = self.scheduler.gpu_id
132
134
  kv_manager_class = get_kv_class(self.transfer_backend, KVClassType.MANAGER)
133
135
  kv_manager = kv_manager_class(
134
- kv_args, DisaggregationMode.DECODE, self.scheduler.server_args
136
+ kv_args,
137
+ DisaggregationMode.DECODE,
138
+ self.scheduler.server_args,
139
+ self.is_mla_backend,
135
140
  )
136
141
  return kv_manager
137
142
 
@@ -509,7 +514,7 @@ class SchedulerDisaggregationDecodeMixin:
509
514
  def event_loop_overlap_disagg_decode(self: Scheduler):
510
515
  result_queue = deque()
511
516
  self.last_batch: Optional[ScheduleBatch] = None
512
- self.last_batch_in_queue = False # last batch is modifed in-place, so we need another variable to track if it's extend
517
+ self.last_batch_in_queue = False # last batch is modified in-place, so we need another variable to track if it's extend
513
518
 
514
519
  while True:
515
520
  recv_reqs = self.recv_requests()
@@ -54,7 +54,7 @@ class FakeKVSender(BaseKVSender):
54
54
  logger.info(f"FakeKVSender send success")
55
55
  else:
56
56
  self.has_sent = False
57
- logger.info(f"FakeKVSender send fake transfering")
57
+ logger.info(f"FakeKVSender send fake transferring")
58
58
 
59
59
  def failure_exception(self):
60
60
  raise Exception("Fake KVSender Exception")