sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -1
- sglang/eval/loogle_eval.py +7 -0
- sglang/srt/configs/deepseekvl2.py +11 -2
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +9 -7
- sglang/srt/configs/update_config.py +3 -1
- sglang/srt/conversation.py +1 -0
- sglang/srt/custom_op.py +5 -2
- sglang/srt/disaggregation/decode.py +9 -1
- sglang/srt/disaggregation/mooncake/conn.py +44 -56
- sglang/srt/distributed/parallel_state.py +33 -0
- sglang/srt/entrypoints/engine.py +30 -26
- sglang/srt/entrypoints/openai/serving_chat.py +21 -2
- sglang/srt/eplb/expert_location_dispatch.py +1 -1
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/qwen3_detector.py +150 -0
- sglang/srt/hf_transformers_utils.py +0 -1
- sglang/srt/layers/activation.py +13 -0
- sglang/srt/layers/attention/flashattention_backend.py +3 -3
- sglang/srt/layers/attention/flashinfer_backend.py +40 -1
- sglang/srt/layers/linear.py +13 -102
- sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
- sglang/srt/layers/moe/ep_moe/layer.py +23 -402
- sglang/srt/layers/moe/fused_moe_native.py +7 -47
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +35 -45
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
- sglang/srt/layers/moe/topk.py +187 -12
- sglang/srt/layers/quantization/__init__.py +20 -134
- sglang/srt/layers/quantization/awq.py +578 -11
- sglang/srt/layers/quantization/awq_triton.py +339 -0
- sglang/srt/layers/quantization/base_config.py +85 -10
- sglang/srt/layers/quantization/blockwise_int8.py +17 -55
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +24 -73
- sglang/srt/layers/quantization/fp8.py +273 -62
- sglang/srt/layers/quantization/fp8_kernel.py +210 -46
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/gptq.py +501 -143
- sglang/srt/layers/quantization/marlin_utils.py +790 -0
- sglang/srt/layers/quantization/modelopt_quant.py +26 -108
- sglang/srt/layers/quantization/moe_wna16.py +45 -49
- sglang/srt/layers/quantization/petit.py +252 -0
- sglang/srt/layers/quantization/petit_utils.py +104 -0
- sglang/srt/layers/quantization/qoq.py +7 -6
- sglang/srt/layers/quantization/scalar_type.py +352 -0
- sglang/srt/layers/quantization/unquant.py +422 -0
- sglang/srt/layers/quantization/utils.py +343 -3
- sglang/srt/layers/quantization/w4afp8.py +8 -4
- sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
- sglang/srt/layers/quantization/w8a8_int8.py +51 -115
- sglang/srt/layers/vocab_parallel_embedding.py +1 -41
- sglang/srt/lora/lora.py +0 -4
- sglang/srt/lora/lora_manager.py +87 -53
- sglang/srt/lora/mem_pool.py +81 -33
- sglang/srt/lora/utils.py +12 -5
- sglang/srt/managers/cache_controller.py +241 -0
- sglang/srt/managers/io_struct.py +41 -29
- sglang/srt/managers/mm_utils.py +7 -8
- sglang/srt/managers/schedule_batch.py +150 -110
- sglang/srt/managers/schedule_policy.py +68 -27
- sglang/srt/managers/scheduler.py +243 -61
- sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
- sglang/srt/managers/tokenizer_manager.py +11 -3
- sglang/srt/managers/tp_worker.py +14 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/allocator.py +7 -16
- sglang/srt/mem_cache/base_prefix_cache.py +14 -2
- sglang/srt/mem_cache/chunk_cache.py +5 -2
- sglang/srt/mem_cache/hicache_storage.py +152 -0
- sglang/srt/mem_cache/hiradix_cache.py +179 -4
- sglang/srt/mem_cache/memory_pool.py +16 -1
- sglang/srt/mem_cache/memory_pool_host.py +41 -2
- sglang/srt/mem_cache/radix_cache.py +26 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +5 -6
- sglang/srt/model_executor/forward_batch_info.py +14 -1
- sglang/srt/model_executor/model_runner.py +109 -22
- sglang/srt/model_loader/loader.py +7 -1
- sglang/srt/model_loader/utils.py +4 -4
- sglang/srt/models/clip.py +1 -1
- sglang/srt/models/deepseek.py +9 -6
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +191 -171
- sglang/srt/models/deepseek_vl2.py +5 -5
- sglang/srt/models/gemma.py +48 -0
- sglang/srt/models/gemma2.py +52 -0
- sglang/srt/models/gemma3_causal.py +63 -0
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -4
- sglang/srt/models/granitemoe.py +385 -0
- sglang/srt/models/grok.py +9 -3
- sglang/srt/models/hunyuan.py +63 -16
- sglang/srt/models/internvl.py +1 -1
- sglang/srt/models/kimi_vl.py +1 -1
- sglang/srt/models/llama.py +41 -0
- sglang/srt/models/llama4.py +11 -11
- sglang/srt/models/llava.py +2 -2
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpm.py +0 -2
- sglang/srt/models/minicpmo.py +3 -7
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mistral.py +1 -1
- sglang/srt/models/mixtral.py +9 -2
- sglang/srt/models/mllama.py +3 -5
- sglang/srt/models/mllama4.py +3 -3
- sglang/srt/models/olmoe.py +8 -5
- sglang/srt/models/persimmon.py +330 -0
- sglang/srt/models/phi.py +321 -0
- sglang/srt/models/phi4mm.py +44 -4
- sglang/srt/models/phi4mm_audio.py +1260 -0
- sglang/srt/models/phi4mm_utils.py +1917 -0
- sglang/srt/models/phimoe.py +9 -3
- sglang/srt/models/qwen.py +37 -0
- sglang/srt/models/qwen2.py +41 -0
- sglang/srt/models/qwen2_5_vl.py +4 -4
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +53 -5
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/qwen3.py +65 -1
- sglang/srt/models/qwen3_moe.py +56 -18
- sglang/srt/models/vila.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +91 -97
- sglang/srt/multimodal/processors/clip.py +21 -19
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
- sglang/srt/multimodal/processors/gemma3.py +13 -17
- sglang/srt/multimodal/processors/gemma3n.py +19 -23
- sglang/srt/multimodal/processors/internvl.py +9 -10
- sglang/srt/multimodal/processors/janus_pro.py +12 -27
- sglang/srt/multimodal/processors/kimi_vl.py +12 -14
- sglang/srt/multimodal/processors/llava.py +4 -2
- sglang/srt/multimodal/processors/minicpm.py +35 -44
- sglang/srt/multimodal/processors/mlama.py +21 -18
- sglang/srt/multimodal/processors/mllama4.py +4 -5
- sglang/srt/multimodal/processors/phi4mm.py +63 -39
- sglang/srt/multimodal/processors/pixtral.py +14 -35
- sglang/srt/multimodal/processors/qwen_audio.py +65 -0
- sglang/srt/multimodal/processors/qwen_vl.py +16 -21
- sglang/srt/multimodal/processors/vila.py +14 -14
- sglang/srt/sampling/sampling_params.py +8 -1
- sglang/srt/server_args.py +393 -230
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +9 -1
- sglang/srt/two_batch_overlap.py +1 -0
- sglang/srt/utils.py +27 -1
- sglang/test/runners.py +14 -3
- sglang/test/test_block_fp8.py +8 -3
- sglang/test/test_block_fp8_ep.py +1 -1
- sglang/test/test_custom_ops.py +12 -7
- sglang/test/test_cutlass_w4a8_moe.py +1 -3
- sglang/test/test_fp4_moe.py +1 -3
- sglang/test/test_marlin_moe.py +286 -0
- sglang/test/test_marlin_utils.py +171 -0
- sglang/test/test_utils.py +35 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/METADATA +8 -8
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/RECORD +166 -146
- sglang/srt/layers/quantization/quant_utils.py +0 -166
- sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/top_level.txt +0 -0
sglang/srt/metrics/collector.py
CHANGED
@@ -145,6 +145,7 @@ class SchedulerStats:
     num_prefill_infight_queue_reqs: int = 0
     num_decode_prealloc_queue_reqs: int = 0
     num_decode_transfer_queue_reqs: int = 0
+    total_retracted_reqs: int = 0


 class SchedulerMetricsCollector:
@@ -219,6 +220,13 @@ class SchedulerMetricsCollector:
             multiprocess_mode="mostrecent",
         )

+        self.total_retracted_reqs = Gauge(
+            name="sglang:total_retracted_reqs",
+            documentation="The total number of retracted requests due to kvcache full.",
+            labelnames=labels.keys(),
+            multiprocess_mode="mostrecent",
+        )
+
         # Disaggregation queue metrics
         self.num_prefill_prealloc_queue_reqs = Gauge(
             name="sglang:num_prefill_prealloc_queue_reqs",
@@ -279,6 +287,7 @@ class SchedulerMetricsCollector:
         self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
         self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
         self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
+        self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)

         # Disaggregation metrics
         self._log_gauge(
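For readers unfamiliar with the pattern, the new counter follows the same `prometheus_client` multiprocess-gauge convention as the existing scheduler gauges. A minimal standalone sketch (the label values are illustrative, and `mostrecent` assumes a reasonably recent `prometheus_client`):

```python
from prometheus_client import Gauge

labels = {"model_name": "example-model"}  # illustrative label set

total_retracted_reqs = Gauge(
    name="sglang:total_retracted_reqs",
    documentation="The total number of retracted requests due to kvcache full.",
    labelnames=labels.keys(),
    multiprocess_mode="mostrecent",
)

# SchedulerMetricsCollector._log_gauge essentially reduces to label-then-set:
total_retracted_reqs.labels(**labels).set(3)
```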
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -264,7 +264,7 @@ class CudaGraphRunner:
         if self.enable_torch_compile:
             set_torch_compile_config()

-        if self.model_runner.server_args.
+        if self.model_runner.server_args.enable_lora:
             self.model_runner.lora_manager.init_cuda_graph_batch_info(self.max_bs)

         # Graph inputs
@@ -510,11 +510,10 @@ class CudaGraphRunner:
             spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
         )

-        if self.model_runner.server_args.
-            #
-            #
-
-            lora_paths = [next(iter(self.model_runner.server_args.lora_paths))] * bs
+        if self.model_runner.server_args.enable_lora:
+            # It is safe to capture CUDA graph using empty LoRA path, as the LoRA kernels will always be launched whenever
+            # `--enable-lora` is set to True (and return immediately if the LoRA path is empty for perf optimization).
+            lora_paths = [None] * bs
         else:
             lora_paths = None

sglang/srt/model_executor/forward_batch_info.py
CHANGED
@@ -68,6 +68,8 @@ class ForwardMode(IntEnum):
     MIXED = auto()
     # No sequence to forward. For data parallel attention, some workers will be IDLE if no sequence are allocated.
     IDLE = auto()
+    # Split Prefill for PD multiplexing
+    SPLIT_PREFILL = auto()

     # Used in speculative decoding: verify a batch in the target model.
     TARGET_VERIFY = auto()
@@ -95,6 +97,9 @@ class ForwardMode(IntEnum):
     def is_mixed(self):
         return self == ForwardMode.MIXED

+    def is_split_prefill(self):
+        return self == ForwardMode.SPLIT_PREFILL
+
     def is_idle(self):
         return self == ForwardMode.IDLE

@@ -194,6 +199,14 @@ class ForwardBatch:
     extend_logprob_start_lens_cpu: Optional[List[int]] = None
     extend_input_logprob_token_ids_gpu: Optional[torch.Tensor] = None

+    # For split prefill
+    # intermediate values for split prefill
+    hidden_states: torch.Tensor = None
+    residual: torch.Tensor = None
+    model_specific_states: Dict[str, any] = None
+    # current split index of layer
+    split_index: int = 0
+
     # For MLA chunked prefix cache used in chunked prefill
     # Tell attention backend whether the kv cache needs to be attended in current pass
     attn_attend_prefix_cache: Optional[bool] = None
@@ -405,7 +418,7 @@ class ForwardBatch:
         ret._compute_mrope_positions(model_runner, batch)

         # Init lora information
-        if model_runner.server_args.
+        if model_runner.server_args.enable_lora:
             model_runner.lora_manager.prepare_lora_batch(ret)

         TboForwardBatchPreparer.prepare(
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -275,6 +275,15 @@ class ModelRunner:
         self.sampler = Sampler()
         self.load_model()

+        if (
+            not self.server_args.disable_hybrid_swa_memory
+            and self.sliding_window_size is not None
+            and self.sliding_window_size > 0
+        ):
+            architectures = self.model_config.hf_config.architectures
+            if architectures and not any("Llama4" in arch for arch in architectures):
+                self.is_hybrid = self.model_config.is_hybrid = True
+
         self.start_layer = getattr(self.model, "start_layer", 0)
         self.end_layer = getattr(
             self.model, "end_layer", self.model_config.num_hidden_layers
@@ -295,11 +304,7 @@ class ModelRunner:
             self.apply_torch_tp()

         # Init lora
-
-        # a new server arg `enable_lora` to control whether to init LoRA manager to be more
-        # explicit, as it is perfectly valid to start a server with an empty lora_paths and
-        # load LoRA adapters dynamically later.
-        if server_args.lora_paths is not None:
+        if server_args.enable_lora:
             self.init_lora_manager()

         # Init memory pool and attention backends
@@ -402,7 +407,7 @@ class ModelRunner:
             else:
                 server_args.attention_backend = "triton"
             logger.info(
-                f"Attention backend not
+                f"Attention backend not explicitly specified. Use {server_args.attention_backend} backend by default."
             )
         elif self.use_mla_backend:
             if server_args.device != "cpu":
@@ -454,7 +459,7 @@ class ModelRunner:
         if not self.is_multimodal_chunked_prefill_supported:
             server_args.chunked_prefill_size = -1
             logger.info(
-                f"Automatically turn
+                f"Automatically turn off --chunked-prefill-size as it is not supported for "
                 f"{self.model_config.hf_config.model_type}"
             )

@@ -471,10 +476,6 @@ class ModelRunner:
         if self.model_config.context_len > 8192:
             self.mem_fraction_static *= 0.85

-        if self.is_hybrid and not server_args.disable_radix_cache:
-            logger.info("Automatically disable radix cache for hybrid cache.")
-            server_args.disable_radix_cache = True
-
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")

@@ -534,6 +535,7 @@ class ModelRunner:
         initialize_model_parallel(
             tensor_model_parallel_size=self.tp_size,
             pipeline_model_parallel_size=self.pp_size,
+            duplicate_tp_group=self.server_args.enable_pdmux,
         )
         initialize_dp_attention(
             enable_dp_attention=self.server_args.enable_dp_attention,
@@ -555,7 +557,7 @@ class ModelRunner:

         # Check memory for tensor parallelism
         local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id)
-        if self.tp_size > 1:
+        if self.tp_size > 1 and not self.is_draft_worker:
             if min_per_gpu_memory < local_gpu_memory * 0.9:
                 if get_bool_env_var("SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK"):
                     logger.warning(
@@ -645,11 +647,15 @@ class ModelRunner:
         )

         # Parse other args
-        self.sliding_window_size =
-
-
-
-
+        self.sliding_window_size = None
+        if hasattr(self.model, "get_attention_sliding_window_size"):
+            self.sliding_window_size = self.model.get_attention_sliding_window_size()
+        elif self.model_config.attention_chunk_size is not None:
+            self.sliding_window_size = self.model_config.attention_chunk_size
+            print(
+                f"Setting sliding_window_size to be attention_chunk_size: {self.sliding_window_size}"
+            )
+
         self.dtype = self.model_config.dtype

         after_avail_memory = get_available_gpu_memory(self.device, self.gpu_id)
@@ -882,8 +888,10 @@ class ModelRunner:
             lora_backend=self.server_args.lora_backend,
             tp_size=self.tp_size,
             tp_rank=self.tp_rank,
+            max_lora_rank=self.server_args.max_lora_rank,
+            target_modules=self.server_args.lora_target_modules,
         )
-        result = self.lora_manager.load_lora_adapters(self.server_args.lora_paths)
+        result = self.lora_manager.load_lora_adapters(self.server_args.lora_paths or {})
         if result.success:
             logger.info(
                 f"LoRA manager ready. Loaded LoRA adapters: {', '.join(result.loaded_adapters)}"
@@ -992,8 +1000,53 @@ class ModelRunner:
             )
             self.max_total_num_tokens = self.full_max_total_num_tokens
         else:
-
-
+            assert self.sliding_window_size is not None and self.sliding_window_size > 0
+            full_attention_layer_ids = []
+            swa_attention_layer_ids = []
+
+            try:
+                layers = self.model.model.layers
+            except:
+                try:
+                    layers = self.model.language_model.model.layers
+                except:
+                    self.is_hybrid = False
+                    return
+
+            for layer in layers:
+                if (
+                    layer.self_attn.attn.sliding_window_size is None
+                    or layer.self_attn.attn.sliding_window_size == -1
+                ):
+                    full_attention_layer_ids.append(layer.layer_id)
+                else:
+                    swa_attention_layer_ids.append(layer.layer_id)
+            self.model_config.swa_attention_layer_ids = swa_attention_layer_ids
+            self.model_config.full_attention_layer_ids = full_attention_layer_ids
+
+            # Algorithm:
+            # Existing max_total_num_tokens is per layer and assume all layers have the same number of tokens.
+            # - Find total # of tokens available across layers.
+            # - Calculate full_max_total_num_tokens and swa_max_total_num_tokens based on the given swa_full_tokens_ratio.
+            total_tokens = (
+                self.max_total_num_tokens * self.model_config.num_hidden_layers
+            )
+            full_layers_num = len(full_attention_layer_ids)
+            swa_layers_num = len(swa_attention_layer_ids)
+            swa_full_tokens_ratio = self.server_args.swa_full_tokens_ratio
+
+            # Solve the equations:
+            # 1. swa_max_total_num_tokens * swa_layers_num + full_max_total_num_tokens * full_layers_num == total_tokens
+            # 2. full_max_total_num_tokens * swa_full_tokens_ratio == swa_max_total_num_tokens
+            denominator = swa_full_tokens_ratio * swa_layers_num + full_layers_num
+            self.full_max_total_num_tokens = int(total_tokens / denominator)
+            self.swa_max_total_num_tokens = int(
+                self.full_max_total_num_tokens * swa_full_tokens_ratio
+            )
+            self.max_total_num_tokens = self.full_max_total_num_tokens
+
+            logger.info(
+                f"Use Sliding window memory pool. full_layer_tokens={self.full_max_total_num_tokens}, swa_layer_tokens={self.swa_max_total_num_tokens}"
             )

     def init_memory_pool(
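The two commented equations above fully determine the hybrid-cache split. A quick numeric sanity check with made-up values (only `swa_full_tokens_ratio` comes from this diff; every number is illustrative):

```python
# Illustrative check of the SWA/full token split solved in the hunk above.
max_total_num_tokens = 100_000      # per-layer token budget before the split
num_hidden_layers = 32
full_layers_num, swa_layers_num = 8, 24
swa_full_tokens_ratio = 0.5         # hypothetical value of --swa-full-tokens-ratio

total_tokens = max_total_num_tokens * num_hidden_layers
# 1. swa * swa_layers_num + full * full_layers_num == total_tokens
# 2. swa == full * swa_full_tokens_ratio
denominator = swa_full_tokens_ratio * swa_layers_num + full_layers_num
full_max_total_num_tokens = int(total_tokens / denominator)
swa_max_total_num_tokens = int(full_max_total_num_tokens * swa_full_tokens_ratio)

print(full_max_total_num_tokens, swa_max_total_num_tokens)  # 160000 80000
```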
@@ -1072,7 +1125,6 @@ class ModelRunner:
             // self.server_args.page_size
             * self.server_args.page_size
         )
-
         # create token size for hybrid cache
         if self.is_hybrid:
             self.set_num_token_hybrid()
@@ -1457,11 +1509,34 @@ class ModelRunner:
             **kwargs,
         )

+    def forward_split_prefill(
+        self,
+        forward_batch: ForwardBatch,
+        reinit_attn_backend: bool = False,
+        forward_count: int = 1,
+    ) -> LogitsProcessorOutput:
+        if forward_batch.split_index == 0 or reinit_attn_backend:
+            self.attn_backend.init_forward_metadata(forward_batch)
+        next_split_index = min(
+            forward_batch.split_index + forward_count,
+            self.model_config.num_hidden_layers,
+        )
+        ret = self.model.forward_split_prefill(
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+            (forward_batch.split_index, next_split_index),
+        )
+        forward_batch.split_index = next_split_index
+        return ret
+
     def forward(
         self,
         forward_batch: ForwardBatch,
         skip_attn_backend_init: bool = False,
         pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        reinit_attn_backend: bool = False,
+        split_forward_count: int = 1,
     ) -> Tuple[Union[LogitsProcessorOutput, PPProxyTensors], bool]:
         self.forward_pass_id += 1

@@ -1470,7 +1545,11 @@ class ModelRunner:
             forward_batch,
         ):
             output = self._forward_raw(
-                forward_batch,
+                forward_batch,
+                skip_attn_backend_init,
+                pp_proxy_tensors,
+                reinit_attn_backend,
+                split_forward_count,
             )

         if self.eplb_manager is not None:
@@ -1483,6 +1562,8 @@ class ModelRunner:
         forward_batch: ForwardBatch,
         skip_attn_backend_init: bool,
         pp_proxy_tensors: Optional[PPProxyTensors],
+        reinit_attn_backend: bool = False,
+        split_forward_count: int = 1,
     ) -> Tuple[Union[LogitsProcessorOutput, PPProxyTensors], bool]:
         can_run_cuda_graph = bool(
             forward_batch.forward_mode.is_cuda_graph()
@@ -1503,6 +1584,12 @@ class ModelRunner:
                 skip_attn_backend_init=skip_attn_backend_init,
                 pp_proxy_tensors=pp_proxy_tensors,
             )
+        elif forward_batch.forward_mode.is_split_prefill():
+            ret = self.forward_split_prefill(
+                forward_batch,
+                reinit_attn_backend=reinit_attn_backend,
+                forward_count=split_forward_count,
+            )
         elif forward_batch.forward_mode.is_idle():
             ret = self.forward_idle(forward_batch, pp_proxy_tensors=pp_proxy_tensors)
         else:
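Taken together, the new `SPLIT_PREFILL` mode, the `ForwardBatch.split_index` field, and `ModelRunner.forward_split_prefill` let a prefill be executed a few layers at a time, which is what PD multiplexing needs. A hedged sketch of how a caller could drive it (the loop and names below are illustrative, not code from this release; the actual scheduler integration lives elsewhere):

```python
# Sketch: run a split prefill in chunks of `layers_per_step` layers.
# Assumes forward_batch.forward_mode is ForwardMode.SPLIT_PREFILL.
def run_split_prefill(model_runner, forward_batch, layers_per_step=4):
    assert forward_batch.forward_mode.is_split_prefill()
    output = None
    while forward_batch.split_index < model_runner.model_config.num_hidden_layers:
        # Each call advances split_index by up to `layers_per_step` layers.
        output, _ = model_runner.forward(
            forward_batch,
            split_forward_count=layers_per_step,
        )
    return output  # logits from the final chunk; earlier chunks stash hidden_states/residual
```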
sglang/srt/model_loader/loader.py
CHANGED
@@ -575,7 +575,13 @@ class DummyModelLoader(BaseModelLoader):
         # 2. Post-processing of weights, including assigning specific member variables.
         # For `dummy_init`, only the second stage is required.
         if hasattr(model, "post_load_weights"):
-
+            if (
+                model_config.hf_config.architectures[0]
+                == "DeepseekV3ForCausalLMNextN"
+            ):
+                model.post_load_weights(is_nextn=True)
+            else:
+                model.post_load_weights()

         return model.eval()

sglang/srt/model_loader/utils.py
CHANGED
@@ -56,14 +56,14 @@ def resolve_transformers_arch(model_config: ModelConfig, architectures: list[str
                "if the model is custom)."
            )
        model_module = auto_modules["AutoModel"]
-        if model_config.
+        if model_config.model_impl == ModelImpl.TRANSFORMERS:
            if not model_module.is_backend_compatible():
                raise ValueError(
                    f"The Transformers implementation of {arch} is not "
-                    "compatible with
+                    "compatible with SGLang."
                )
            architectures[i] = "TransformersForCausalLM"
-        if model_config.
+        if model_config.model_impl == ModelImpl.AUTO:
            if not model_module.is_backend_compatible():
                raise ValueError(
                    f"{arch} has no SGlang implementation and the Transformers "
@@ -97,7 +97,7 @@ def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module],
    supported_archs = ModelRegistry.get_supported_archs()
    is_native_supported = any(arch in supported_archs for arch in architectures)

-    if not is_native_supported or model_config.
+    if not is_native_supported or model_config.model_impl == ModelImpl.TRANSFORMERS:
        architectures = resolve_transformers_arch(model_config, architectures)

    return ModelRegistry.resolve_model_cls(architectures)
sglang/srt/models/clip.py
CHANGED
@@ -463,7 +463,7 @@ class CLIPModel(nn.Module):
         if forward_batch.mm_inputs is not None:
             mm_inputs = forward_batch.mm_inputs
             pixel_values_list = [
-                item.
+                item.feature
                 for item in flatten_nested_list(
                     [mm_input.mm_items for mm_input in mm_inputs if mm_input is not None]
                 )
sglang/srt/models/deepseek.py
CHANGED
@@ -37,6 +37,7 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.fused_moe_triton import fused_moe
+from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope
@@ -109,7 +110,10 @@ class DeepseekMoE(nn.Module):
                 f"Tensor parallel size {self.tp_size} is greater than "
                 f"the number of experts {self.n_routed_experts}."
             )
-
+        self.topk = TopK(
+            top_k=self.top_k,
+            renormalize=config.norm_topk_prob,
+        )
         self.experts = nn.ModuleList(
             [
                 DeepseekMLP(
@@ -170,13 +174,12 @@ class DeepseekMoE(nn.Module):
         shared_output = self.shared_experts(hidden_states)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
         final_hidden_states = fused_moe.fused_moe(
             hidden_states,
-            self.w1,
-            self.w2,
-
-            self.top_k,
-            renormalize=self.config.norm_topk_prob,
+            w1=self.w1,
+            w2=self.w2,
+            topk_output=topk_output,
             inplace=True,
         )

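The MoE change above splits routing from expert execution: a standalone `TopK` module turns the router logits into a `topk_output`, which `fused_moe` now consumes in place of the old `top_k`/`renormalize` arguments. The same call pattern in isolation (a sketch; the wrapper function and the literal `top_k=6` are assumptions, while the imports and argument names come from the diff):

```python
from sglang.srt.layers.moe.fused_moe_triton import fused_moe
from sglang.srt.layers.moe.topk import TopK

topk = TopK(top_k=6, renormalize=True)  # e.g. 6 experts per token

def moe_forward(hidden_states, gate, w1, w2):
    router_logits, _ = gate(hidden_states)            # (num_tokens, n_experts)
    topk_output = topk(hidden_states, router_logits)  # expert ids + routing weights
    return fused_moe.fused_moe(
        hidden_states, w1=w1, w2=w2, topk_output=topk_output, inplace=True
    )
```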
sglang/srt/models/deepseek_janus_pro.py
CHANGED
@@ -1960,7 +1960,7 @@ class MultiModalityCausalLM(MultiModalityPreTrainedModel):
         self.logits_processor = LogitsProcessor(config)

     def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
-        pixel_values = torch.concat([item.
+        pixel_values = torch.concat([item.feature for item in items], dim=0)
         bs, n = pixel_values.shape[0:2]
         pixel_values = pixel_values.to(
             device=self.vision_model.device, dtype=self.vision_model.dtype