sglang 0.5.0rc2__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. sglang/bench_one_batch.py +0 -6
  2. sglang/bench_one_batch_server.py +7 -2
  3. sglang/bench_serving.py +3 -3
  4. sglang/eval/llama3_eval.py +0 -1
  5. sglang/srt/configs/model_config.py +24 -9
  6. sglang/srt/configs/update_config.py +40 -5
  7. sglang/srt/constrained/xgrammar_backend.py +23 -11
  8. sglang/srt/conversation.py +2 -15
  9. sglang/srt/disaggregation/ascend/conn.py +1 -3
  10. sglang/srt/disaggregation/base/conn.py +1 -0
  11. sglang/srt/disaggregation/decode.py +1 -1
  12. sglang/srt/disaggregation/launch_lb.py +7 -1
  13. sglang/srt/disaggregation/mini_lb.py +11 -5
  14. sglang/srt/disaggregation/mooncake/conn.py +141 -47
  15. sglang/srt/disaggregation/prefill.py +261 -5
  16. sglang/srt/disaggregation/utils.py +2 -1
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  18. sglang/srt/distributed/device_communicators/pynccl.py +68 -18
  19. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
  20. sglang/srt/distributed/naive_distributed.py +112 -0
  21. sglang/srt/distributed/parallel_state.py +90 -4
  22. sglang/srt/entrypoints/context.py +20 -1
  23. sglang/srt/entrypoints/engine.py +27 -2
  24. sglang/srt/entrypoints/http_server.py +12 -0
  25. sglang/srt/entrypoints/openai/protocol.py +2 -2
  26. sglang/srt/entrypoints/openai/serving_chat.py +22 -6
  27. sglang/srt/entrypoints/openai/serving_completions.py +9 -1
  28. sglang/srt/entrypoints/openai/serving_responses.py +2 -2
  29. sglang/srt/eplb/expert_distribution.py +2 -3
  30. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  31. sglang/srt/hf_transformers_utils.py +24 -0
  32. sglang/srt/host_shared_memory.py +83 -0
  33. sglang/srt/layers/attention/ascend_backend.py +132 -22
  34. sglang/srt/layers/attention/flashattention_backend.py +24 -17
  35. sglang/srt/layers/attention/flashinfer_backend.py +11 -3
  36. sglang/srt/layers/attention/flashinfer_mla_backend.py +226 -76
  37. sglang/srt/layers/attention/triton_backend.py +85 -46
  38. sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
  39. sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
  40. sglang/srt/layers/attention/trtllm_mha_backend.py +390 -30
  41. sglang/srt/layers/attention/trtllm_mla_backend.py +39 -16
  42. sglang/srt/layers/attention/utils.py +94 -15
  43. sglang/srt/layers/attention/vision.py +40 -13
  44. sglang/srt/layers/attention/vision_utils.py +65 -0
  45. sglang/srt/layers/communicator.py +51 -3
  46. sglang/srt/layers/dp_attention.py +23 -4
  47. sglang/srt/layers/elementwise.py +94 -0
  48. sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
  49. sglang/srt/layers/layernorm.py +8 -1
  50. sglang/srt/layers/linear.py +24 -0
  51. sglang/srt/layers/logits_processor.py +5 -1
  52. sglang/srt/layers/moe/__init__.py +31 -0
  53. sglang/srt/layers/moe/ep_moe/layer.py +37 -33
  54. sglang/srt/layers/moe/fused_moe_native.py +14 -25
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
  60. sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
  61. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
  62. sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
  63. sglang/srt/layers/moe/moe_runner/base.py +13 -0
  64. sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
  65. sglang/srt/layers/moe/router.py +15 -9
  66. sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
  67. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
  68. sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
  69. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  70. sglang/srt/layers/moe/topk.py +167 -83
  71. sglang/srt/layers/moe/utils.py +159 -18
  72. sglang/srt/layers/quantization/__init__.py +13 -14
  73. sglang/srt/layers/quantization/awq.py +7 -7
  74. sglang/srt/layers/quantization/base_config.py +2 -6
  75. sglang/srt/layers/quantization/blockwise_int8.py +4 -12
  76. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -28
  77. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
  78. sglang/srt/layers/quantization/fp8.py +127 -119
  79. sglang/srt/layers/quantization/fp8_kernel.py +195 -24
  80. sglang/srt/layers/quantization/fp8_utils.py +34 -9
  81. sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
  82. sglang/srt/layers/quantization/gptq.py +5 -4
  83. sglang/srt/layers/quantization/marlin_utils.py +11 -3
  84. sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
  85. sglang/srt/layers/quantization/modelopt_quant.py +165 -68
  86. sglang/srt/layers/quantization/moe_wna16.py +10 -15
  87. sglang/srt/layers/quantization/mxfp4.py +206 -37
  88. sglang/srt/layers/quantization/quark/quark.py +390 -0
  89. sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
  90. sglang/srt/layers/quantization/unquant.py +34 -70
  91. sglang/srt/layers/quantization/utils.py +25 -0
  92. sglang/srt/layers/quantization/w4afp8.py +7 -8
  93. sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
  94. sglang/srt/layers/quantization/w8a8_int8.py +5 -13
  95. sglang/srt/layers/radix_attention.py +6 -0
  96. sglang/srt/layers/rotary_embedding.py +1 -0
  97. sglang/srt/lora/lora_manager.py +21 -22
  98. sglang/srt/lora/lora_registry.py +3 -3
  99. sglang/srt/lora/mem_pool.py +26 -24
  100. sglang/srt/lora/utils.py +10 -12
  101. sglang/srt/managers/cache_controller.py +76 -18
  102. sglang/srt/managers/detokenizer_manager.py +10 -2
  103. sglang/srt/managers/io_struct.py +9 -0
  104. sglang/srt/managers/mm_utils.py +1 -1
  105. sglang/srt/managers/schedule_batch.py +4 -9
  106. sglang/srt/managers/scheduler.py +25 -16
  107. sglang/srt/managers/session_controller.py +1 -1
  108. sglang/srt/managers/template_manager.py +7 -5
  109. sglang/srt/managers/tokenizer_manager.py +60 -21
  110. sglang/srt/managers/tp_worker.py +1 -0
  111. sglang/srt/managers/utils.py +59 -1
  112. sglang/srt/mem_cache/allocator.py +7 -5
  113. sglang/srt/mem_cache/allocator_ascend.py +0 -11
  114. sglang/srt/mem_cache/hicache_storage.py +14 -4
  115. sglang/srt/mem_cache/memory_pool.py +3 -3
  116. sglang/srt/mem_cache/memory_pool_host.py +35 -2
  117. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
  118. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
  119. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
  120. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
  121. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
  122. sglang/srt/model_executor/cuda_graph_runner.py +25 -12
  123. sglang/srt/model_executor/forward_batch_info.py +4 -1
  124. sglang/srt/model_executor/model_runner.py +43 -32
  125. sglang/srt/model_executor/npu_graph_runner.py +94 -0
  126. sglang/srt/model_loader/loader.py +24 -6
  127. sglang/srt/models/dbrx.py +12 -6
  128. sglang/srt/models/deepseek.py +2 -1
  129. sglang/srt/models/deepseek_nextn.py +3 -1
  130. sglang/srt/models/deepseek_v2.py +224 -223
  131. sglang/srt/models/ernie4.py +2 -2
  132. sglang/srt/models/glm4_moe.py +25 -63
  133. sglang/srt/models/glm4v.py +52 -1
  134. sglang/srt/models/glm4v_moe.py +8 -11
  135. sglang/srt/models/gpt_oss.py +34 -74
  136. sglang/srt/models/granitemoe.py +0 -1
  137. sglang/srt/models/grok.py +376 -48
  138. sglang/srt/models/interns1.py +12 -47
  139. sglang/srt/models/internvl.py +6 -51
  140. sglang/srt/models/llama4.py +0 -2
  141. sglang/srt/models/minicpm3.py +0 -1
  142. sglang/srt/models/mixtral.py +0 -2
  143. sglang/srt/models/nemotron_nas.py +435 -0
  144. sglang/srt/models/olmoe.py +0 -1
  145. sglang/srt/models/phi4mm.py +3 -21
  146. sglang/srt/models/qwen2_5_vl.py +2 -0
  147. sglang/srt/models/qwen2_moe.py +3 -18
  148. sglang/srt/models/qwen3.py +2 -2
  149. sglang/srt/models/qwen3_classification.py +7 -1
  150. sglang/srt/models/qwen3_moe.py +9 -38
  151. sglang/srt/models/step3_vl.py +2 -1
  152. sglang/srt/models/xverse_moe.py +11 -5
  153. sglang/srt/multimodal/processors/base_processor.py +3 -3
  154. sglang/srt/multimodal/processors/internvl.py +7 -2
  155. sglang/srt/multimodal/processors/llava.py +11 -7
  156. sglang/srt/offloader.py +433 -0
  157. sglang/srt/operations.py +6 -1
  158. sglang/srt/reasoning_parser.py +4 -3
  159. sglang/srt/server_args.py +237 -104
  160. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
  161. sglang/srt/speculative/eagle_utils.py +36 -13
  162. sglang/srt/speculative/eagle_worker.py +56 -3
  163. sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
  164. sglang/srt/two_batch_overlap.py +16 -11
  165. sglang/srt/utils.py +68 -70
  166. sglang/test/runners.py +8 -5
  167. sglang/test/test_block_fp8.py +5 -6
  168. sglang/test/test_block_fp8_ep.py +13 -19
  169. sglang/test/test_cutlass_moe.py +4 -6
  170. sglang/test/test_cutlass_w4a8_moe.py +4 -3
  171. sglang/test/test_fp4_moe.py +4 -3
  172. sglang/test/test_utils.py +7 -0
  173. sglang/utils.py +0 -1
  174. sglang/version.py +1 -1
  175. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/METADATA +7 -7
  176. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/RECORD +179 -161
  177. sglang/srt/layers/quantization/fp4.py +0 -557
  178. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
  179. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
  180. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -61,7 +61,6 @@ from sglang.srt.configs.model_config import ModelConfig
  from sglang.srt.distributed.parallel_state import destroy_distributed_environment
  from sglang.srt.entrypoints.engine import _set_envs_and_config
  from sglang.srt.hf_transformers_utils import get_tokenizer
- from sglang.srt.layers.moe.utils import DeepEPMode, MoeA2ABackend
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
  from sglang.srt.managers.scheduler import Scheduler
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -300,11 +299,6 @@ def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
  disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
  spec_algorithm=SpeculativeAlgorithm.NONE,
  speculative_num_draft_tokens=None,
- enable_two_batch_overlap=model_runner.server_args.enable_two_batch_overlap,
- enable_deepep_moe=MoeA2ABackend(
- model_runner.server_args.moe_a2a_backend
- ).is_deepep(),
- deepep_mode=DeepEPMode(model_runner.server_args.deepep_mode),
  require_mlp_tp_gather=require_mlp_tp_gather(model_runner.server_args),
  disable_overlap_schedule=model_runner.server_args.disable_overlap_schedule,
  )
sglang/bench_one_batch_server.py CHANGED
@@ -26,7 +26,7 @@ from sglang.bench_serving import get_tokenizer, sample_random_requests
  from sglang.profiler import run_profile
  from sglang.srt.entrypoints.http_server import launch_server
  from sglang.srt.server_args import ServerArgs
- from sglang.srt.utils import kill_process_tree
+ from sglang.srt.utils import is_blackwell, kill_process_tree
  from sglang.test.test_utils import is_in_ci, write_github_step_summary


@@ -363,7 +363,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
  acc_length,
  trace_link,
  ) in result:
- hourly_cost = 2 * server_args.tp_size # $2/hour for one H100
+ if is_blackwell():
+ hourly_cost_per_gpu = 4 # $4/hour for one B200
+ else:
+ hourly_cost_per_gpu = 2 # $2/hour for one H100
+
+ hourly_cost = hourly_cost_per_gpu * server_args.tp_size
  input_util = 0.7
  accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
  line = (
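The new pricing branch is easy to sanity-check by hand. Below is a standalone sketch of the estimate; the real code calls is_blackwell() from sglang.srt.utils, for which a plain boolean stands in here:

def estimate_hourly_cost(tp_size: int, on_blackwell: bool) -> int:
    # $4/hour per B200, $2/hour per H100, scaled by tensor-parallel size.
    hourly_cost_per_gpu = 4 if on_blackwell else 2
    return hourly_cost_per_gpu * tp_size

assert estimate_hourly_cost(8, on_blackwell=True) == 32   # 8x B200 -> $32/hour
assert estimate_hourly_cost(8, on_blackwell=False) == 16  # 8x H100 -> $16/hour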
sglang/bench_serving.py CHANGED
@@ -864,11 +864,11 @@ def sample_mmmu_requests(
  if image.mode == "RGBA":
  image = image.convert("RGB")

- # Encode image to base64
+ # Encode image to base64 (save as PNG to support palette/alpha modes)
  buffered = io.BytesIO()
- image.save(buffered, format="JPEG")
+ image.save(buffered, format="PNG")
  img_str = pybase64.b64encode(buffered.getvalue()).decode("utf-8")
- image_data = f"data:image/jpeg;base64,{img_str}"
+ image_data = f"data:image/png;base64,{img_str}"
  else:
  continue

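The format switch matters because JPEG cannot encode palette ("P") or alpha ("RGBA") images, while PNG is lossless and accepts all PIL modes. A minimal sketch of the data-URI construction used above, substituting the stdlib base64 for pybase64 (both expose the same b64encode call):

import base64
import io

from PIL import Image

def image_to_png_data_uri(image: Image.Image) -> str:
    # PNG round-trips any mode losslessly, so no JPEG-specific conversion
    # is required before encoding.
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{img_str}"

# A 2x2 RGBA image encodes directly; saving it as JPEG would raise an OSError.
print(image_to_png_data_uri(Image.new("RGBA", (2, 2), (255, 0, 0, 128)))[:40])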
sglang/eval/llama3_eval.py CHANGED
@@ -12,7 +12,6 @@ from dataclasses import dataclass
  import httpx
  import numpy as np
  import openai
- import transformers
  from datasets import load_dataset
  from openai import AsyncOpenAI
  from tqdm import tqdm
sglang/srt/configs/model_config.py CHANGED
@@ -32,6 +32,7 @@ from sglang.srt.hf_transformers_utils import (
  from sglang.srt.layers.quantization import QUANTIZATION_METHODS
  from sglang.srt.server_args import ServerArgs
  from sglang.srt.utils import get_bool_env_var, is_hip
+ from sglang.utils import is_in_ci

  logger = logging.getLogger(__name__)

@@ -166,19 +167,20 @@ class ModelConfig:
  derived_context_len = get_context_length(self.hf_text_config)
  if context_length is not None:
  if context_length > derived_context_len:
- if get_bool_env_var(
- "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", default="True"
+ reason = "Target model's" if is_draft_model else "User-specified"
+ msg = (
+ f"Warning: {reason} context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+ f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config."
+ )
+ if (
+ get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN")
+ or is_in_ci() # FIXME: fix this special case
  ):
- logger.warning(
- f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
- f"This may lead to incorrect model outputs or CUDA errors."
- )
+ logger.warning(msg)
  self.context_len = context_length
  else:
  raise ValueError(
- f"User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
- f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config. "
- f"To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
+ f"{msg} To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
  )
  else:
  self.context_len = context_length
@@ -341,6 +343,19 @@ class ModelConfig:
  "kv_n_heads",
  self.hf_config.num_attention_heads,
  )
+ if self.hf_config.model_type in ["nemotron-nas"]:
+ nkvh = {
+ self.hf_config.num_attention_heads // block.attention.n_heads_in_group
+ for block in self.hf_config.block_configs
+ if not block.attention.no_op
+ }
+ if len(nkvh) == 0:
+ raise RuntimeError("Couldn't determine number of kv heads")
+ if len(nkvh) > 1:
+ raise ValueError(
+ "Variable GQA (VGQA) is not yet supported for nemotron-nas in sglang"
+ )
+ return next(iter(nkvh))

  attributes = [
  # For Falcon:
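The nemotron-nas branch above collapses per-block GQA group sizes into a single KV-head count with a set comprehension. A self-contained sketch of that logic, where the Attn dataclass is a hypothetical stand-in for the attention entries of hf_config.block_configs:

from dataclasses import dataclass

@dataclass
class Attn:
    n_heads_in_group: int
    no_op: bool = False

def derive_num_kv_heads(num_attention_heads: int, blocks: list) -> int:
    # One candidate KV-head count per active block: a uniform GQA layout
    # yields a singleton set, mixed group sizes (VGQA) yield more.
    nkvh = {num_attention_heads // b.n_heads_in_group for b in blocks if not b.no_op}
    if not nkvh:
        raise RuntimeError("Couldn't determine number of kv heads")
    if len(nkvh) > 1:
        raise ValueError("Variable GQA (VGQA) is not yet supported")
    return next(iter(nkvh))

# 32 query heads, 8 query heads per KV head in every active block -> 4 KV heads.
assert derive_num_kv_heads(32, [Attn(8), Attn(8), Attn(8, no_op=True)]) == 4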
sglang/srt/configs/update_config.py CHANGED
@@ -49,14 +49,25 @@ def get_num_heads_padding_size(tp_size, weight_block_size):


  def update_intermediate_size(model_config, attr_name, intermediate_padding_size):
- if hasattr(model_config.hf_config, attr_name):
+ attr_value = intermediate_padding_size
+ if hasattr(model_config, "hf_config") and hasattr(
+ model_config.hf_config, attr_name
+ ):
  attr_value = getattr(model_config.hf_config, attr_name)
- if attr_value % intermediate_padding_size != 0:
- from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+ elif hasattr(model_config, attr_name):
+ attr_value = getattr(model_config, attr_name)
+
+ if attr_value % intermediate_padding_size != 0:
+ from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size

- attr_value = pad_vocab_size(attr_value, intermediate_padding_size)
+ attr_value = pad_vocab_size(attr_value, intermediate_padding_size)
+ if hasattr(model_config, "hf_config"):
  setattr(model_config.hf_config, attr_name, attr_value)
- setattr(model_config.hf_text_config, attr_name, attr_value)
+ if hasattr(model_config, "hf_text_config"):
+ setattr(model_config.hf_text_config, attr_name, attr_value)
+ else:
+ setattr(model_config, attr_name, attr_value)
+
  return model_config


@@ -118,4 +129,28 @@ def adjust_config_with_unaligned_cpu_tp(
  model_config = update_intermediate_size(
  model_config, "intermediate_size_mlp", intermediate_padding_size
  )
+ if (
+ hasattr(model_config.hf_config, "vision_config")
+ and model_config.hf_config.vision_config.model_type == "siglip_vision_model"
+ ):
+ model_config.hf_config.vision_config.original_num_attention_heads = (
+ model_config.num_attention_heads
+ )
+ if model_config.hf_config.vision_config.num_attention_heads % tp_size != 0:
+ model_config.hf_config.vision_config.head_dim = (
+ model_config.hf_config.vision_config.hidden_size
+ // model_config.hf_config.vision_config.num_attention_heads
+ )
+ from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+
+ pad_size = get_num_heads_padding_size(tp_size, weight_block_size)
+ model_config.hf_config.vision_config.num_attention_heads = pad_vocab_size(
+ model_config.hf_config.vision_config.num_attention_heads, pad_size
+ )
+ model_config.hf_config.vision_config = update_intermediate_size(
+ model_config.hf_config.vision_config,
+ "intermediate_size",
+ intermediate_padding_size,
+ )
+
  return model_config
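Both update_intermediate_size and the new siglip vision-config path lean on pad_vocab_size to round a dimension up to a multiple of the padding size; the underlying arithmetic is plain ceiling division. A sketch (pad_to_multiple is illustrative, not the actual sglang helper):

def pad_to_multiple(value: int, multiple: int) -> int:
    # Ceiling division, then scale back up; aligned values pass through.
    return -(-value // multiple) * multiple

assert pad_to_multiple(11008, 128) == 11008  # already a multiple of 128
assert pad_to_multiple(11000, 128) == 11008  # rounded up for unaligned CPU TP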
sglang/srt/constrained/xgrammar_backend.py CHANGED
@@ -32,10 +32,15 @@ from sglang.srt.constrained.base_grammar_backend import (
  BaseGrammarBackend,
  BaseGrammarObject,
  )
- from sglang.srt.constrained.triton_ops.bitmask_ops import (
- apply_token_bitmask_inplace_triton,
- )
-
+ from sglang.srt.utils import is_hip
+
+ _is_hip = is_hip()
+ if _is_hip:
+ from sgl_kernel import apply_token_bitmask_inplace_cuda
+ else:
+ from sglang.srt.constrained.triton_ops.bitmask_ops import (
+ apply_token_bitmask_inplace_triton,
+ )
  logger = logging.getLogger(__name__)


@@ -94,7 +99,10 @@ class XGrammarGrammar(BaseGrammarObject):

  def apply_vocab_mask(self, logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
  if logits.device.type == "cuda":
- apply_token_bitmask_inplace_triton(logits, vocab_mask)
+ if _is_hip:
+ apply_token_bitmask_inplace_cuda(logits, vocab_mask)
+ else:
+ apply_token_bitmask_inplace_triton(logits, vocab_mask)
  elif logits.device.type == "cpu" and self.apply_vocab_mask_cpu:
  self.apply_vocab_mask_cpu(logits, vocab_mask)
  else:
@@ -154,12 +162,16 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
  ):
  super().__init__()

- # Create TokenizerInfo with model's EOS tokens as the authoritative stop tokens
- # This ensures consistency between what the model considers EOS and what XGrammar uses
- tokenizer_info = TokenizerInfo.from_huggingface(
- tokenizer, vocab_size=vocab_size, stop_token_ids=model_eos_token_ids
- )
- override_stop_tokens = None
+ if hasattr(tokenizer, "init_xgrammar"):
+ # For special tokenizer
+ tokenizer_info, override_stop_tokens = tokenizer.init_xgrammar()
+ else:
+ # Create TokenizerInfo with model's EOS tokens as the authoritative stop tokens
+ # This ensures consistency between what the model considers EOS and what XGrammar uses
+ tokenizer_info = TokenizerInfo.from_huggingface(
+ tokenizer, vocab_size=vocab_size, stop_token_ids=model_eos_token_ids
+ )
+ override_stop_tokens = None

  self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
  self.vocab_size = vocab_size
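Whichever kernel is selected, the semantics are the same: the grammar produces a packed bitmask over the vocabulary, and disallowed logits are driven to -inf before sampling. A pure-PyTorch reference of that masking, assuming the packed int32 layout XGrammar uses (bit i of word w maps to token w * 32 + i); this is a sketch of the semantics only, far slower than the Triton or sgl_kernel paths:

import torch

def apply_token_bitmask_reference(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
    # vocab_mask: int32 of shape (batch, ceil(vocab / 32)); a set bit marks
    # the corresponding token as allowed by the grammar.
    vocab = logits.shape[-1]
    token_ids = torch.arange(vocab, device=logits.device)
    words = vocab_mask[..., token_ids // 32]
    allowed = (words >> (token_ids % 32)) & 1
    logits.masked_fill_(allowed == 0, float("-inf"))

logits = torch.zeros(1, 64)
mask = torch.tensor([[0b1011, 0]], dtype=torch.int32)  # tokens 0, 1, 3 allowed
apply_token_bitmask_reference(logits, mask)
assert torch.isinf(logits[0, 2]) and torch.isfinite(logits[0, 3])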
sglang/srt/conversation.py CHANGED
@@ -625,7 +625,7 @@ def generate_chat_conv(
  real_content += content.text
  elif content.type == "image_url":
  # NOTE: works for llava and intervl2_5
- if conv.name in ["internvl-2-5", "interns1"]:
+ if conv.name in ["internvl-2-5"]:
  real_content = image_token + real_content
  else:
  real_content += image_token
@@ -817,20 +817,7 @@ register_conv_template(
  sep_style=SeparatorStyle.MPT,
  sep="<|im_end|>\n",
  stop_str=["<|im_end|>", "<|action_end|>"],
- image_token="<image>",
- )
- )
-
- register_conv_template(
- Conversation(
- name="interns1",
- system_template="<|im_start|>system\n{system_message}",
- system_message="You are an AI assistant whose name is Intern-S1 (书生大模型).\n- Intern-S1 (书生大模型) is a vision-language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n- Intern-S1 (书生大模型) can understand and communicate fluently in the language chosen by the user such as English and 中文.\nYou are an expert reasoner with extensive experience in all areas. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.",
- roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
- sep_style=SeparatorStyle.MPT,
- sep="<|im_end|>\n",
- stop_str=["<|im_end|>", "<|action_end|>"],
- image_token="<image>",
+ image_token="<IMG_CONTEXT>",
  )
  )

sglang/srt/disaggregation/ascend/conn.py CHANGED
@@ -23,9 +23,7 @@ class AscendKVManager(MooncakeKVManager):
  )

  def register_buffer_to_engine(self):
- self.engine.register(
- self.kv_args.kv_data_ptrs[0], sum(self.kv_args.kv_data_lens)
- )
+ self.engine.batch_register(self.kv_args.kv_data_ptrs, self.kv_args.kv_data_lens)
  # The Ascend backend optimize batch registration for small memory blocks.
  self.engine.batch_register(
  self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
sglang/srt/disaggregation/base/conn.py CHANGED
@@ -30,6 +30,7 @@ class KVArgs:
  # for pp prefill
  prefill_pp_size: int
  pp_rank: int
+ prefill_start_layer: int
  # for system dp
  system_dp_rank: int

sglang/srt/disaggregation/decode.py CHANGED
@@ -259,7 +259,7 @@ class DecodePreallocQueue:
  if len(req.origin_input_ids) > self.max_total_num_tokens:
  message = f"Request {req.rid} exceeds the maximum number of tokens: {len(req.origin_input_ids)} > {self.max_total_num_tokens}"
  logger.error(message)
- prepare_abort(req, message)
+ prepare_abort(req, message, status_code=HTTPStatus.BAD_REQUEST)
  self.scheduler.stream_output([req], req.return_logprob)
  return True
  return False
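With the added status_code, an oversized request is reported to the client as 400 Bad Request instead of a generic server-side failure. A hedged sketch of the guard; prepare_abort and the scheduler are sglang internals, replaced here by a plain dict:

from http import HTTPStatus

MAX_TOTAL_NUM_TOKENS = 8192  # illustrative capacity, not sglang's default

def maybe_reject_request(rid: str, input_ids: list) -> dict:
    # Inputs longer than the KV-cache capacity are aborted up front and
    # tagged as a client error rather than an internal one.
    if len(input_ids) > MAX_TOTAL_NUM_TOKENS:
        return {
            "rid": rid,
            "status_code": int(HTTPStatus.BAD_REQUEST),  # 400
            "message": f"Request {rid} exceeds the maximum number of tokens: "
                       f"{len(input_ids)} > {MAX_TOTAL_NUM_TOKENS}",
        }
    return {}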
sglang/srt/disaggregation/launch_lb.py CHANGED
@@ -118,7 +118,13 @@ def main():
  lb_args = LBArgs.from_cli_args(args)

  prefill_configs = [PrefillConfig(url, port) for url, port in lb_args.prefill_infos]
- run(prefill_configs, lb_args.decode_infos, lb_args.host, lb_args.port)
+ run(
+ prefill_configs,
+ lb_args.decode_infos,
+ lb_args.host,
+ lb_args.port,
+ lb_args.timeout,
+ )


  if __name__ == "__main__":
sglang/srt/disaggregation/mini_lb.py CHANGED
@@ -50,10 +50,16 @@ class PrefillConfig:


  class MiniLoadBalancer:
- def __init__(self, prefill_configs: List[PrefillConfig], decode_servers: List[str]):
+ def __init__(
+ self,
+ prefill_configs: List[PrefillConfig],
+ decode_servers: List[str],
+ timeout: int,
+ ):
  self.prefill_configs = prefill_configs
  self.prefill_servers = [p.url for p in prefill_configs]
  self.decode_servers = decode_servers
+ self.timeout = timeout

  def add_prefill_server(self, new_prefill_config: PrefillConfig):
  self.prefill_configs.append(new_prefill_config)
@@ -78,7 +84,7 @@ class MiniLoadBalancer:

  async with aiohttp.ClientSession(
  timeout=aiohttp.ClientTimeout(
- total=3600
+ total=self.timeout
  ) # Add timeout for request reliability
  ) as session:
  tasks = [
@@ -117,7 +123,7 @@ class MiniLoadBalancer:
  async def stream_results():
  async with aiohttp.ClientSession(
  timeout=aiohttp.ClientTimeout(
- total=3600
+ total=self.timeout
  ) # Add timeout for request reliability
  ) as session:
  # Create the tasks for both prefill and decode requests
@@ -401,9 +407,9 @@ async def register(obj: PDRegistryRequest):
  return Response(status_code=200)


- def run(prefill_configs, decode_addrs, host, port):
+ def run(prefill_configs, decode_addrs, host, port, timeout):
  global load_balancer
- load_balancer = MiniLoadBalancer(prefill_configs, decode_addrs)
+ load_balancer = MiniLoadBalancer(prefill_configs, decode_addrs, timeout=timeout)
  uvicorn.run(app, host=host, port=port)

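Taken together, the launch_lb.py and mini_lb.py changes thread one --timeout value from the CLI down to every proxied request, replacing the hard-coded total=3600. A minimal sketch of the pattern; aiohttp.ClientTimeout is the real API, the URL is a placeholder:

import asyncio

import aiohttp

async def proxy_get(url: str, timeout_s: int) -> int:
    # One configurable budget for the whole round trip, as in MiniLoadBalancer.
    async with aiohttp.ClientSession(
        timeout=aiohttp.ClientTimeout(total=timeout_s)
    ) as session:
        async with session.get(url) as resp:
            return resp.status

# asyncio.run(proxy_get("http://localhost:30000/health", timeout_s=600))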