sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168)
  1. sglang/bench_one_batch.py +2 -1
  2. sglang/eval/loogle_eval.py +7 -0
  3. sglang/srt/configs/deepseekvl2.py +11 -2
  4. sglang/srt/configs/internvl.py +3 -0
  5. sglang/srt/configs/janus_pro.py +3 -0
  6. sglang/srt/configs/model_config.py +9 -7
  7. sglang/srt/configs/update_config.py +3 -1
  8. sglang/srt/conversation.py +1 -0
  9. sglang/srt/custom_op.py +5 -2
  10. sglang/srt/disaggregation/decode.py +9 -1
  11. sglang/srt/disaggregation/mooncake/conn.py +44 -56
  12. sglang/srt/distributed/parallel_state.py +33 -0
  13. sglang/srt/entrypoints/engine.py +30 -26
  14. sglang/srt/entrypoints/openai/serving_chat.py +21 -2
  15. sglang/srt/eplb/expert_location_dispatch.py +1 -1
  16. sglang/srt/function_call/function_call_parser.py +2 -0
  17. sglang/srt/function_call/qwen3_detector.py +150 -0
  18. sglang/srt/hf_transformers_utils.py +0 -1
  19. sglang/srt/layers/activation.py +13 -0
  20. sglang/srt/layers/attention/flashattention_backend.py +3 -3
  21. sglang/srt/layers/attention/flashinfer_backend.py +40 -1
  22. sglang/srt/layers/linear.py +13 -102
  23. sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
  24. sglang/srt/layers/moe/ep_moe/layer.py +23 -402
  25. sglang/srt/layers/moe/fused_moe_native.py +7 -47
  26. sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
  27. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +35 -45
  33. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
  34. sglang/srt/layers/moe/topk.py +187 -12
  35. sglang/srt/layers/quantization/__init__.py +20 -134
  36. sglang/srt/layers/quantization/awq.py +578 -11
  37. sglang/srt/layers/quantization/awq_triton.py +339 -0
  38. sglang/srt/layers/quantization/base_config.py +85 -10
  39. sglang/srt/layers/quantization/blockwise_int8.py +17 -55
  40. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
  41. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +24 -73
  42. sglang/srt/layers/quantization/fp8.py +273 -62
  43. sglang/srt/layers/quantization/fp8_kernel.py +210 -46
  44. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  45. sglang/srt/layers/quantization/gptq.py +501 -143
  46. sglang/srt/layers/quantization/marlin_utils.py +790 -0
  47. sglang/srt/layers/quantization/modelopt_quant.py +26 -108
  48. sglang/srt/layers/quantization/moe_wna16.py +45 -49
  49. sglang/srt/layers/quantization/petit.py +252 -0
  50. sglang/srt/layers/quantization/petit_utils.py +104 -0
  51. sglang/srt/layers/quantization/qoq.py +7 -6
  52. sglang/srt/layers/quantization/scalar_type.py +352 -0
  53. sglang/srt/layers/quantization/unquant.py +422 -0
  54. sglang/srt/layers/quantization/utils.py +343 -3
  55. sglang/srt/layers/quantization/w4afp8.py +8 -4
  56. sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
  57. sglang/srt/layers/quantization/w8a8_int8.py +51 -115
  58. sglang/srt/layers/vocab_parallel_embedding.py +1 -41
  59. sglang/srt/lora/lora.py +0 -4
  60. sglang/srt/lora/lora_manager.py +87 -53
  61. sglang/srt/lora/mem_pool.py +81 -33
  62. sglang/srt/lora/utils.py +12 -5
  63. sglang/srt/managers/cache_controller.py +241 -0
  64. sglang/srt/managers/io_struct.py +41 -29
  65. sglang/srt/managers/mm_utils.py +7 -8
  66. sglang/srt/managers/schedule_batch.py +150 -110
  67. sglang/srt/managers/schedule_policy.py +68 -27
  68. sglang/srt/managers/scheduler.py +243 -61
  69. sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
  70. sglang/srt/managers/tokenizer_manager.py +11 -3
  71. sglang/srt/managers/tp_worker.py +14 -0
  72. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  73. sglang/srt/mem_cache/allocator.py +7 -16
  74. sglang/srt/mem_cache/base_prefix_cache.py +14 -2
  75. sglang/srt/mem_cache/chunk_cache.py +5 -2
  76. sglang/srt/mem_cache/hicache_storage.py +152 -0
  77. sglang/srt/mem_cache/hiradix_cache.py +179 -4
  78. sglang/srt/mem_cache/memory_pool.py +16 -1
  79. sglang/srt/mem_cache/memory_pool_host.py +41 -2
  80. sglang/srt/mem_cache/radix_cache.py +26 -0
  81. sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
  82. sglang/srt/metrics/collector.py +9 -0
  83. sglang/srt/model_executor/cuda_graph_runner.py +5 -6
  84. sglang/srt/model_executor/forward_batch_info.py +14 -1
  85. sglang/srt/model_executor/model_runner.py +109 -22
  86. sglang/srt/model_loader/loader.py +7 -1
  87. sglang/srt/model_loader/utils.py +4 -4
  88. sglang/srt/models/clip.py +1 -1
  89. sglang/srt/models/deepseek.py +9 -6
  90. sglang/srt/models/deepseek_janus_pro.py +1 -1
  91. sglang/srt/models/deepseek_v2.py +191 -171
  92. sglang/srt/models/deepseek_vl2.py +5 -5
  93. sglang/srt/models/gemma.py +48 -0
  94. sglang/srt/models/gemma2.py +52 -0
  95. sglang/srt/models/gemma3_causal.py +63 -0
  96. sglang/srt/models/gemma3_mm.py +1 -1
  97. sglang/srt/models/gemma3n_mm.py +2 -4
  98. sglang/srt/models/granitemoe.py +385 -0
  99. sglang/srt/models/grok.py +9 -3
  100. sglang/srt/models/hunyuan.py +63 -16
  101. sglang/srt/models/internvl.py +1 -1
  102. sglang/srt/models/kimi_vl.py +1 -1
  103. sglang/srt/models/llama.py +41 -0
  104. sglang/srt/models/llama4.py +11 -11
  105. sglang/srt/models/llava.py +2 -2
  106. sglang/srt/models/llavavid.py +1 -1
  107. sglang/srt/models/minicpm.py +0 -2
  108. sglang/srt/models/minicpmo.py +3 -7
  109. sglang/srt/models/minicpmv.py +1 -1
  110. sglang/srt/models/mistral.py +1 -1
  111. sglang/srt/models/mixtral.py +9 -2
  112. sglang/srt/models/mllama.py +3 -5
  113. sglang/srt/models/mllama4.py +3 -3
  114. sglang/srt/models/olmoe.py +8 -5
  115. sglang/srt/models/persimmon.py +330 -0
  116. sglang/srt/models/phi.py +321 -0
  117. sglang/srt/models/phi4mm.py +44 -4
  118. sglang/srt/models/phi4mm_audio.py +1260 -0
  119. sglang/srt/models/phi4mm_utils.py +1917 -0
  120. sglang/srt/models/phimoe.py +9 -3
  121. sglang/srt/models/qwen.py +37 -0
  122. sglang/srt/models/qwen2.py +41 -0
  123. sglang/srt/models/qwen2_5_vl.py +4 -4
  124. sglang/srt/models/qwen2_audio.py +1 -1
  125. sglang/srt/models/qwen2_moe.py +53 -5
  126. sglang/srt/models/qwen2_vl.py +4 -4
  127. sglang/srt/models/qwen3.py +65 -1
  128. sglang/srt/models/qwen3_moe.py +56 -18
  129. sglang/srt/models/vila.py +1 -1
  130. sglang/srt/multimodal/processors/base_processor.py +91 -97
  131. sglang/srt/multimodal/processors/clip.py +21 -19
  132. sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
  133. sglang/srt/multimodal/processors/gemma3.py +13 -17
  134. sglang/srt/multimodal/processors/gemma3n.py +19 -23
  135. sglang/srt/multimodal/processors/internvl.py +9 -10
  136. sglang/srt/multimodal/processors/janus_pro.py +12 -27
  137. sglang/srt/multimodal/processors/kimi_vl.py +12 -14
  138. sglang/srt/multimodal/processors/llava.py +4 -2
  139. sglang/srt/multimodal/processors/minicpm.py +35 -44
  140. sglang/srt/multimodal/processors/mlama.py +21 -18
  141. sglang/srt/multimodal/processors/mllama4.py +4 -5
  142. sglang/srt/multimodal/processors/phi4mm.py +63 -39
  143. sglang/srt/multimodal/processors/pixtral.py +14 -35
  144. sglang/srt/multimodal/processors/qwen_audio.py +65 -0
  145. sglang/srt/multimodal/processors/qwen_vl.py +16 -21
  146. sglang/srt/multimodal/processors/vila.py +14 -14
  147. sglang/srt/sampling/sampling_params.py +8 -1
  148. sglang/srt/server_args.py +393 -230
  149. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +9 -1
  150. sglang/srt/two_batch_overlap.py +1 -0
  151. sglang/srt/utils.py +27 -1
  152. sglang/test/runners.py +14 -3
  153. sglang/test/test_block_fp8.py +8 -3
  154. sglang/test/test_block_fp8_ep.py +1 -1
  155. sglang/test/test_custom_ops.py +12 -7
  156. sglang/test/test_cutlass_w4a8_moe.py +1 -3
  157. sglang/test/test_fp4_moe.py +1 -3
  158. sglang/test/test_marlin_moe.py +286 -0
  159. sglang/test/test_marlin_utils.py +171 -0
  160. sglang/test/test_utils.py +35 -0
  161. sglang/version.py +1 -1
  162. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/METADATA +8 -8
  163. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/RECORD +166 -146
  164. sglang/srt/layers/quantization/quant_utils.py +0 -166
  165. sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
  166. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/WHEEL +0 -0
  167. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/licenses/LICENSE +0 -0
  168. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/top_level.txt +0 -0
sglang/srt/metrics/collector.py CHANGED
@@ -145,6 +145,7 @@ class SchedulerStats:
     num_prefill_infight_queue_reqs: int = 0
     num_decode_prealloc_queue_reqs: int = 0
     num_decode_transfer_queue_reqs: int = 0
+    total_retracted_reqs: int = 0
 
 
 class SchedulerMetricsCollector:
@@ -219,6 +220,13 @@ class SchedulerMetricsCollector:
             multiprocess_mode="mostrecent",
         )
 
+        self.total_retracted_reqs = Gauge(
+            name="sglang:total_retracted_reqs",
+            documentation="The total number of retracted requests due to kvcache full.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
         # Disaggregation queue metrics
         self.num_prefill_prealloc_queue_reqs = Gauge(
             name="sglang:num_prefill_prealloc_queue_reqs",
@@ -279,6 +287,7 @@ class SchedulerMetricsCollector:
         self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
         self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
         self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
+        self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
 
         # Disaggregation metrics
         self._log_gauge(
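The new `sglang:total_retracted_reqs` gauge follows the same `prometheus_client` pattern as the existing scheduler metrics. A minimal, self-contained sketch of that pattern (the `model_name` label is an illustrative assumption, not SGLang's actual label set):

```python
# Hedged sketch of a labeled Prometheus gauge, mirroring the style used above.
# The "model_name" label is an assumption for illustration only.
from prometheus_client import Gauge, generate_latest

total_retracted_reqs = Gauge(
    name="sglang:total_retracted_reqs",
    documentation="The total number of retracted requests due to kvcache full.",
    labelnames=["model_name"],
)

# The scheduler would set this whenever requests are retracted because the KV
# cache is full; here we just set a value and scrape the default registry.
total_retracted_reqs.labels(model_name="demo-model").set(3)
print(generate_latest().decode())
```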
sglang/srt/model_executor/cuda_graph_runner.py CHANGED
@@ -264,7 +264,7 @@ class CudaGraphRunner:
         if self.enable_torch_compile:
             set_torch_compile_config()
 
-        if self.model_runner.server_args.lora_paths is not None:
+        if self.model_runner.server_args.enable_lora:
             self.model_runner.lora_manager.init_cuda_graph_batch_info(self.max_bs)
 
         # Graph inputs
@@ -510,11 +510,10 @@ class CudaGraphRunner:
             spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
         )
 
-        if self.model_runner.server_args.lora_paths is not None:
-            # Currently, if the lora_path in `lora_paths` is None, the lora backend will use a
-            # different logic to handle lora, so we need to set `lora_paths` to a list of non-None
-            # values if lora is enabled.
-            lora_paths = [next(iter(self.model_runner.server_args.lora_paths))] * bs
+        if self.model_runner.server_args.enable_lora:
+            # It is safe to capture CUDA graph using empty LoRA path, as the LoRA kernels will always be launched whenever
+            # `--enable-lora` is set to True (and return immediately if the LoRA path is empty for perf optimization).
+            lora_paths = [None] * bs
         else:
             lora_paths = None
 
sglang/srt/model_executor/forward_batch_info.py CHANGED
@@ -68,6 +68,8 @@ class ForwardMode(IntEnum):
     MIXED = auto()
     # No sequence to forward. For data parallel attention, some workers will be IDLE if no sequence are allocated.
     IDLE = auto()
+    # Split Prefill for PD multiplexing
+    SPLIT_PREFILL = auto()
 
     # Used in speculative decoding: verify a batch in the target model.
     TARGET_VERIFY = auto()
@@ -95,6 +97,9 @@ class ForwardMode(IntEnum):
     def is_mixed(self):
         return self == ForwardMode.MIXED
 
+    def is_split_prefill(self):
+        return self == ForwardMode.SPLIT_PREFILL
+
     def is_idle(self):
         return self == ForwardMode.IDLE
 
@@ -194,6 +199,14 @@ class ForwardBatch:
     extend_logprob_start_lens_cpu: Optional[List[int]] = None
     extend_input_logprob_token_ids_gpu: Optional[torch.Tensor] = None
 
+    # For split prefill
+    # intermediate values for split prefill
+    hidden_states: torch.Tensor = None
+    residual: torch.Tensor = None
+    model_specific_states: Dict[str, any] = None
+    # current split index of layer
+    split_index: int = 0
+
     # For MLA chunked prefix cache used in chunked prefill
     # Tell attention backend whether the kv cache needs to be attended in current pass
     attn_attend_prefix_cache: Optional[bool] = None
@@ -405,7 +418,7 @@ class ForwardBatch:
         ret._compute_mrope_positions(model_runner, batch)
 
         # Init lora information
-        if model_runner.server_args.lora_paths is not None:
+        if model_runner.server_args.enable_lora:
             model_runner.lora_manager.prepare_lora_batch(ret)
 
         TboForwardBatchPreparer.prepare(
sglang/srt/model_executor/model_runner.py CHANGED
@@ -275,6 +275,15 @@ class ModelRunner:
         self.sampler = Sampler()
         self.load_model()
 
+        if (
+            not self.server_args.disable_hybrid_swa_memory
+            and self.sliding_window_size is not None
+            and self.sliding_window_size > 0
+        ):
+            architectures = self.model_config.hf_config.architectures
+            if architectures and not any("Llama4" in arch for arch in architectures):
+                self.is_hybrid = self.model_config.is_hybrid = True
+
         self.start_layer = getattr(self.model, "start_layer", 0)
         self.end_layer = getattr(
             self.model, "end_layer", self.model_config.num_hidden_layers
@@ -295,11 +304,7 @@ class ModelRunner:
             self.apply_torch_tp()
 
         # Init lora
-        # TODO (lifuhuang): when we support dynamic LoRA loading / unloading, we should add
-        # a new server arg `enable_lora` to control whether to init LoRA manager to be more
-        # explicit, as it is perfectly valid to start a server with an empty lora_paths and
-        # load LoRA adapters dynamically later.
-        if server_args.lora_paths is not None:
+        if server_args.enable_lora:
             self.init_lora_manager()
 
         # Init memory pool and attention backends
@@ -402,7 +407,7 @@ class ModelRunner:
                 else:
                     server_args.attention_backend = "triton"
                 logger.info(
-                    f"Attention backend not set. Use {server_args.attention_backend} backend by default."
+                    f"Attention backend not explicitly specified. Use {server_args.attention_backend} backend by default."
                 )
             elif self.use_mla_backend:
                 if server_args.device != "cpu":
@@ -454,7 +459,7 @@ class ModelRunner:
             if not self.is_multimodal_chunked_prefill_supported:
                 server_args.chunked_prefill_size = -1
                 logger.info(
-                    f"Automatically turn of --chunked-prefill-size as it is not supported for "
+                    f"Automatically turn off --chunked-prefill-size as it is not supported for "
                     f"{self.model_config.hf_config.model_type}"
                 )
 
@@ -471,10 +476,6 @@ class ModelRunner:
         if self.model_config.context_len > 8192:
             self.mem_fraction_static *= 0.85
 
-        if self.is_hybrid and not server_args.disable_radix_cache:
-            logger.info("Automatically disable radix cache for hybrid cache.")
-            server_args.disable_radix_cache = True
-
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")
 
@@ -534,6 +535,7 @@ class ModelRunner:
         initialize_model_parallel(
             tensor_model_parallel_size=self.tp_size,
             pipeline_model_parallel_size=self.pp_size,
+            duplicate_tp_group=self.server_args.enable_pdmux,
         )
         initialize_dp_attention(
             enable_dp_attention=self.server_args.enable_dp_attention,
@@ -555,7 +557,7 @@ class ModelRunner:
 
         # Check memory for tensor parallelism
         local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id)
-        if self.tp_size > 1:
+        if self.tp_size > 1 and not self.is_draft_worker:
             if min_per_gpu_memory < local_gpu_memory * 0.9:
                 if get_bool_env_var("SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK"):
                     logger.warning(
@@ -645,11 +647,15 @@ class ModelRunner:
         )
 
         # Parse other args
-        self.sliding_window_size = (
-            self.model.get_attention_sliding_window_size()
-            if hasattr(self.model, "get_attention_sliding_window_size")
-            else None
-        )
+        self.sliding_window_size = None
+        if hasattr(self.model, "get_attention_sliding_window_size"):
+            self.sliding_window_size = self.model.get_attention_sliding_window_size()
+        elif self.model_config.attention_chunk_size is not None:
+            self.sliding_window_size = self.model_config.attention_chunk_size
+            print(
+                f"Setting sliding_window_size to be attention_chunk_size: {self.sliding_window_size}"
+            )
+
         self.dtype = self.model_config.dtype
 
         after_avail_memory = get_available_gpu_memory(self.device, self.gpu_id)
@@ -882,8 +888,10 @@ class ModelRunner:
             lora_backend=self.server_args.lora_backend,
             tp_size=self.tp_size,
             tp_rank=self.tp_rank,
+            max_lora_rank=self.server_args.max_lora_rank,
+            target_modules=self.server_args.lora_target_modules,
         )
-        result = self.lora_manager.load_lora_adapters(self.server_args.lora_paths)
+        result = self.lora_manager.load_lora_adapters(self.server_args.lora_paths or {})
         if result.success:
             logger.info(
                 f"LoRA manager ready. Loaded LoRA adapters: {', '.join(result.loaded_adapters)}"
@@ -992,8 +1000,53 @@ class ModelRunner:
             )
             self.max_total_num_tokens = self.full_max_total_num_tokens
         else:
-            raise ValueError(
-                f"Unsupported model for hybrid cache: {self.model_config.hf_config.architectures}."
+            assert self.sliding_window_size is not None and self.sliding_window_size > 0
+            full_attention_layer_ids = []
+            swa_attention_layer_ids = []
+
+            try:
+                layers = self.model.model.layers
+            except:
+                try:
+                    layers = self.model.language_model.model.layers
+                except:
+                    self.is_hybrid = False
+                    return
+
+            for layer in layers:
+                if (
+                    layer.self_attn.attn.sliding_window_size is None
+                    or layer.self_attn.attn.sliding_window_size == -1
+                ):
+                    full_attention_layer_ids.append(layer.layer_id)
+                else:
+                    swa_attention_layer_ids.append(layer.layer_id)
+            self.model_config.swa_attention_layer_ids = swa_attention_layer_ids
+            self.model_config.full_attention_layer_ids = full_attention_layer_ids
+
+            # Algorithm:
+            # Existing max_total_num_tokens is per layer and assume all layers have the same number of tokens.
+            # - Find total # of tokens available across layers.
+            # - Calculate full_max_total_num_tokens and swa_max_total_num_tokens based on the given swa_full_tokens_ratio.
+            total_tokens = (
+                self.max_total_num_tokens * self.model_config.num_hidden_layers
+            )
+            full_layers_num = len(full_attention_layer_ids)
+            swa_layers_num = len(swa_attention_layer_ids)
+            swa_full_tokens_ratio = self.server_args.swa_full_tokens_ratio
+
+            # Solve the equations:
+            # 1. swa_max_total_num_tokens * swa_layers_num + full_max_total_num_tokens * full_layers_num == total_tokens
+            # 2. full_max_total_num_tokens * swa_full_tokens_ratio == swa_max_total_num_tokens
+            denominator = swa_full_tokens_ratio * swa_layers_num + full_layers_num
+            self.full_max_total_num_tokens = int(total_tokens / denominator)
+            self.swa_max_total_num_tokens = int(
+                self.full_max_total_num_tokens * swa_full_tokens_ratio
+            )
+            self.max_total_num_tokens = self.full_max_total_num_tokens
+
+            logger.info(
+                f"Use Sliding window memory pool. full_layer_tokens={self.full_max_total_num_tokens}, swa_layer_tokens={self.swa_max_total_num_tokens}"
             )
 
     def init_memory_pool(
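To make the token-budget equations above concrete, here is a worked example with made-up numbers (48 layers total, 40 SWA layers, 8 full-attention layers, a per-layer budget of 100,000 tokens, and `swa_full_tokens_ratio = 0.25`); it simply re-evaluates the two equations solved in the diff:

```python
# Worked example of the SWA/full-attention token split (illustrative numbers only).
max_total_num_tokens = 100_000                 # per-layer budget before the split
num_hidden_layers = 48
swa_layers_num, full_layers_num = 40, 8
swa_full_tokens_ratio = 0.25

total_tokens = max_total_num_tokens * num_hidden_layers                  # 4,800,000
denominator = swa_full_tokens_ratio * swa_layers_num + full_layers_num   # 18.0
full_max_total_num_tokens = int(total_tokens / denominator)              # 266,666
swa_max_total_num_tokens = int(full_max_total_num_tokens * swa_full_tokens_ratio)  # 66,666

# Equation 1 holds up to integer truncation:
# 66,666 * 40 + 266,666 * 8 = 4,799,968 ≈ 4,800,000
```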
@@ -1072,7 +1125,6 @@ class ModelRunner:
             // self.server_args.page_size
             * self.server_args.page_size
         )
-
         # create token size for hybrid cache
         if self.is_hybrid:
             self.set_num_token_hybrid()
@@ -1457,11 +1509,34 @@ class ModelRunner:
             **kwargs,
         )
 
+    def forward_split_prefill(
+        self,
+        forward_batch: ForwardBatch,
+        reinit_attn_backend: bool = False,
+        forward_count: int = 1,
+    ) -> LogitsProcessorOutput:
+        if forward_batch.split_index == 0 or reinit_attn_backend:
+            self.attn_backend.init_forward_metadata(forward_batch)
+        next_split_index = min(
+            forward_batch.split_index + forward_count,
+            self.model_config.num_hidden_layers,
+        )
+        ret = self.model.forward_split_prefill(
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+            (forward_batch.split_index, next_split_index),
+        )
+        forward_batch.split_index = next_split_index
+        return ret
+
     def forward(
         self,
         forward_batch: ForwardBatch,
         skip_attn_backend_init: bool = False,
         pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        reinit_attn_backend: bool = False,
+        split_forward_count: int = 1,
     ) -> Tuple[Union[LogitsProcessorOutput, PPProxyTensors], bool]:
         self.forward_pass_id += 1
 
@@ -1470,7 +1545,11 @@ class ModelRunner:
             forward_batch,
         ):
             output = self._forward_raw(
-                forward_batch, skip_attn_backend_init, pp_proxy_tensors
+                forward_batch,
+                skip_attn_backend_init,
+                pp_proxy_tensors,
+                reinit_attn_backend,
+                split_forward_count,
             )
 
             if self.eplb_manager is not None:
@@ -1483,6 +1562,8 @@ class ModelRunner:
         forward_batch: ForwardBatch,
         skip_attn_backend_init: bool,
         pp_proxy_tensors: Optional[PPProxyTensors],
+        reinit_attn_backend: bool = False,
+        split_forward_count: int = 1,
     ) -> Tuple[Union[LogitsProcessorOutput, PPProxyTensors], bool]:
         can_run_cuda_graph = bool(
             forward_batch.forward_mode.is_cuda_graph()
@@ -1503,6 +1584,12 @@ class ModelRunner:
                 skip_attn_backend_init=skip_attn_backend_init,
                 pp_proxy_tensors=pp_proxy_tensors,
             )
+        elif forward_batch.forward_mode.is_split_prefill():
+            ret = self.forward_split_prefill(
+                forward_batch,
+                reinit_attn_backend=reinit_attn_backend,
+                forward_count=split_forward_count,
+            )
         elif forward_batch.forward_mode.is_idle():
             ret = self.forward_idle(forward_batch, pp_proxy_tensors=pp_proxy_tensors)
         else:
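Putting the SPLIT_PREFILL pieces together: `forward_split_prefill` runs layers `[split_index, split_index + forward_count)` and advances `forward_batch.split_index`, so a caller is expected to invoke it repeatedly until every layer has run. A hedged sketch of such a driver loop, based only on the signatures in this diff (`model_runner` and `forward_batch` are assumed to be an initialized `ModelRunner` and a prepared `ForwardBatch`; this is not the scheduler's actual code):

```python
from sglang.srt.model_executor.forward_batch_info import ForwardMode

def run_split_prefill(model_runner, forward_batch, layers_per_step: int = 4):
    """Drive a SPLIT_PREFILL batch a few layers at a time (illustrative only)."""
    forward_batch.forward_mode = ForwardMode.SPLIT_PREFILL
    logits_output = None
    while forward_batch.split_index < model_runner.model_config.num_hidden_layers:
        # Attention metadata is initialized on the first chunk (split_index == 0)
        # or when reinit_attn_backend=True; each call then advances split_index.
        logits_output = model_runner.forward_split_prefill(
            forward_batch, forward_count=layers_per_step
        )
    return logits_output  # LogitsProcessorOutput after the final chunk
```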
sglang/srt/model_loader/loader.py CHANGED
@@ -575,7 +575,13 @@ class DummyModelLoader(BaseModelLoader):
             # 2. Post-processing of weights, including assigning specific member variables.
             # For `dummy_init`, only the second stage is required.
             if hasattr(model, "post_load_weights"):
-                model.post_load_weights()
+                if (
+                    model_config.hf_config.architectures[0]
+                    == "DeepseekV3ForCausalLMNextN"
+                ):
+                    model.post_load_weights(is_nextn=True)
+                else:
+                    model.post_load_weights()
 
         return model.eval()
 
sglang/srt/model_loader/utils.py CHANGED
@@ -56,14 +56,14 @@ def resolve_transformers_arch(model_config: ModelConfig, architectures: list[str
                 "if the model is custom)."
             )
         model_module = auto_modules["AutoModel"]
-        if model_config.impl == ModelImpl.TRANSFORMERS:
+        if model_config.model_impl == ModelImpl.TRANSFORMERS:
             if not model_module.is_backend_compatible():
                 raise ValueError(
                     f"The Transformers implementation of {arch} is not "
-                    "compatible with vLLM."
+                    "compatible with SGLang."
                 )
             architectures[i] = "TransformersForCausalLM"
-        if model_config.impl == ModelImpl.AUTO:
+        if model_config.model_impl == ModelImpl.AUTO:
             if not model_module.is_backend_compatible():
                 raise ValueError(
                     f"{arch} has no SGlang implementation and the Transformers "
@@ -97,7 +97,7 @@ def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module],
     supported_archs = ModelRegistry.get_supported_archs()
     is_native_supported = any(arch in supported_archs for arch in architectures)
 
-    if not is_native_supported or model_config.impl == ModelImpl.TRANSFORMERS:
+    if not is_native_supported or model_config.model_impl == ModelImpl.TRANSFORMERS:
         architectures = resolve_transformers_arch(model_config, architectures)
 
     return ModelRegistry.resolve_model_cls(architectures)
sglang/srt/models/clip.py CHANGED
@@ -463,7 +463,7 @@ class CLIPModel(nn.Module):
         if forward_batch.mm_inputs is not None:
             mm_inputs = forward_batch.mm_inputs
             pixel_values_list = [
-                item.pixel_values
+                item.feature
                 for item in flatten_nested_list(
                     [mm_input.mm_items for mm_input in mm_inputs if mm_input is not None]
                 )
sglang/srt/models/deepseek.py CHANGED
@@ -37,6 +37,7 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.fused_moe_triton import fused_moe
+from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope
@@ -109,7 +110,10 @@ class DeepseekMoE(nn.Module):
                 f"Tensor parallel size {self.tp_size} is greater than "
                 f"the number of experts {self.n_routed_experts}."
             )
-
+        self.topk = TopK(
+            top_k=self.top_k,
+            renormalize=config.norm_topk_prob,
+        )
         self.experts = nn.ModuleList(
             [
                 DeepseekMLP(
@@ -170,13 +174,12 @@ class DeepseekMoE(nn.Module):
             shared_output = self.shared_experts(hidden_states)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
         final_hidden_states = fused_moe.fused_moe(
             hidden_states,
-            self.w1,
-            self.w2,
-            router_logits,
-            self.top_k,
-            renormalize=self.config.norm_topk_prob,
+            w1=self.w1,
+            w2=self.w2,
+            topk_output=topk_output,
             inplace=True,
         )
 
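The DeepseekMoE change above reflects this release's wider MoE refactor (see `sglang/srt/layers/moe/topk.py` in the file list): routing is now computed by a `TopK` module and handed to `fused_moe` as `topk_output`, rather than passing the kernel raw router logits plus `top_k`/`renormalize`. As a rough illustration of what top-k routing with renormalization produces, here is a plain-PyTorch sketch (this is not SGLang's `TopK` implementation):

```python
import torch

# Framework-agnostic sketch: top-k expert routing with renormalized weights.
def route_topk(router_logits: torch.Tensor, top_k: int, renormalize: bool = True):
    probs = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
    topk_weights, topk_ids = torch.topk(probs, top_k, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids  # per-token expert weights and expert indices

# Example: 4 tokens routed over 8 experts, top-2 experts per token.
weights, ids = route_topk(torch.randn(4, 8), top_k=2)
print(weights.shape, ids.shape)  # torch.Size([4, 2]) torch.Size([4, 2])
```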
 
sglang/srt/models/deepseek_janus_pro.py CHANGED
@@ -1960,7 +1960,7 @@ class MultiModalityCausalLM(MultiModalityPreTrainedModel):
         self.logits_processor = LogitsProcessor(config)
 
     def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
-        pixel_values = torch.concat([item.pixel_values for item in items], dim=0)
+        pixel_values = torch.concat([item.feature for item in items], dim=0)
         bs, n = pixel_values.shape[0:2]
         pixel_values = pixel_values.to(
             device=self.vision_model.device, dtype=self.vision_model.dtype