sglang 0.4.8.post1__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. sglang/bench_one_batch_server.py +17 -2
  2. sglang/bench_serving.py +170 -24
  3. sglang/srt/configs/internvl.py +4 -2
  4. sglang/srt/configs/janus_pro.py +1 -1
  5. sglang/srt/configs/model_config.py +60 -1
  6. sglang/srt/configs/update_config.py +119 -0
  7. sglang/srt/conversation.py +69 -1
  8. sglang/srt/disaggregation/decode.py +21 -5
  9. sglang/srt/disaggregation/mooncake/conn.py +35 -4
  10. sglang/srt/disaggregation/nixl/conn.py +6 -6
  11. sglang/srt/disaggregation/prefill.py +2 -2
  12. sglang/srt/disaggregation/utils.py +1 -1
  13. sglang/srt/distributed/parallel_state.py +44 -17
  14. sglang/srt/entrypoints/EngineBase.py +8 -0
  15. sglang/srt/entrypoints/engine.py +40 -6
  16. sglang/srt/entrypoints/http_server.py +111 -24
  17. sglang/srt/entrypoints/http_server_engine.py +1 -1
  18. sglang/srt/entrypoints/openai/protocol.py +4 -2
  19. sglang/srt/eplb/__init__.py +0 -0
  20. sglang/srt/{managers → eplb}/eplb_algorithms/__init__.py +1 -1
  21. sglang/srt/{managers → eplb}/eplb_manager.py +2 -4
  22. sglang/srt/{eplb_simulator → eplb/eplb_simulator}/reader.py +1 -1
  23. sglang/srt/{managers → eplb}/expert_distribution.py +1 -5
  24. sglang/srt/{managers → eplb}/expert_location.py +1 -1
  25. sglang/srt/{managers → eplb}/expert_location_dispatch.py +1 -1
  26. sglang/srt/{model_executor → eplb}/expert_location_updater.py +17 -1
  27. sglang/srt/hf_transformers_utils.py +2 -1
  28. sglang/srt/layers/activation.py +2 -2
  29. sglang/srt/layers/amx_utils.py +86 -0
  30. sglang/srt/layers/attention/ascend_backend.py +219 -0
  31. sglang/srt/layers/attention/flashattention_backend.py +32 -9
  32. sglang/srt/layers/attention/tbo_backend.py +37 -9
  33. sglang/srt/layers/communicator.py +20 -2
  34. sglang/srt/layers/dp_attention.py +9 -3
  35. sglang/srt/layers/elementwise.py +76 -12
  36. sglang/srt/layers/flashinfer_comm_fusion.py +202 -0
  37. sglang/srt/layers/layernorm.py +26 -0
  38. sglang/srt/layers/linear.py +84 -14
  39. sglang/srt/layers/logits_processor.py +4 -4
  40. sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
  41. sglang/srt/layers/moe/ep_moe/kernels.py +81 -8
  42. sglang/srt/layers/moe/ep_moe/layer.py +176 -15
  43. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +23 -17
  44. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -2
  45. sglang/srt/layers/moe/fused_moe_triton/layer.py +211 -74
  46. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
  47. sglang/srt/layers/moe/router.py +60 -22
  48. sglang/srt/layers/moe/topk.py +10 -28
  49. sglang/srt/layers/parameter.py +67 -7
  50. sglang/srt/layers/quantization/__init__.py +2 -0
  51. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +1 -1
  52. sglang/srt/layers/quantization/fp8.py +72 -7
  53. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  54. sglang/srt/layers/quantization/fp8_utils.py +1 -2
  55. sglang/srt/layers/quantization/gptq.py +5 -1
  56. sglang/srt/layers/quantization/modelopt_quant.py +244 -1
  57. sglang/srt/layers/quantization/moe_wna16.py +1 -1
  58. sglang/srt/layers/quantization/quant_utils.py +166 -0
  59. sglang/srt/layers/quantization/w4afp8.py +264 -0
  60. sglang/srt/layers/quantization/w8a8_int8.py +52 -1
  61. sglang/srt/layers/rotary_embedding.py +2 -2
  62. sglang/srt/layers/vocab_parallel_embedding.py +20 -10
  63. sglang/srt/lora/lora.py +4 -5
  64. sglang/srt/lora/lora_manager.py +73 -20
  65. sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
  66. sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
  67. sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
  68. sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
  69. sglang/srt/managers/cache_controller.py +41 -195
  70. sglang/srt/managers/configure_logging.py +1 -1
  71. sglang/srt/managers/io_struct.py +58 -14
  72. sglang/srt/managers/mm_utils.py +77 -61
  73. sglang/srt/managers/multimodal_processor.py +2 -6
  74. sglang/srt/managers/multimodal_processors/qwen_audio.py +94 -0
  75. sglang/srt/managers/schedule_batch.py +78 -85
  76. sglang/srt/managers/scheduler.py +130 -64
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +8 -2
  78. sglang/srt/managers/session_controller.py +12 -3
  79. sglang/srt/managers/tokenizer_manager.py +314 -103
  80. sglang/srt/managers/tp_worker.py +13 -1
  81. sglang/srt/managers/tp_worker_overlap_thread.py +8 -0
  82. sglang/srt/mem_cache/allocator.py +290 -0
  83. sglang/srt/mem_cache/chunk_cache.py +34 -2
  84. sglang/srt/mem_cache/hiradix_cache.py +2 -0
  85. sglang/srt/mem_cache/memory_pool.py +402 -66
  86. sglang/srt/mem_cache/memory_pool_host.py +6 -109
  87. sglang/srt/mem_cache/multimodal_cache.py +3 -0
  88. sglang/srt/mem_cache/radix_cache.py +8 -4
  89. sglang/srt/model_executor/cuda_graph_runner.py +2 -1
  90. sglang/srt/model_executor/forward_batch_info.py +17 -4
  91. sglang/srt/model_executor/model_runner.py +297 -56
  92. sglang/srt/model_loader/loader.py +41 -0
  93. sglang/srt/model_loader/weight_utils.py +72 -4
  94. sglang/srt/models/deepseek_nextn.py +1 -3
  95. sglang/srt/models/deepseek_v2.py +195 -45
  96. sglang/srt/models/deepseek_vl2.py +3 -5
  97. sglang/srt/models/gemma3_causal.py +1 -2
  98. sglang/srt/models/gemma3n_causal.py +4 -3
  99. sglang/srt/models/gemma3n_mm.py +4 -20
  100. sglang/srt/models/hunyuan.py +1 -1
  101. sglang/srt/models/kimi_vl.py +1 -2
  102. sglang/srt/models/llama.py +10 -4
  103. sglang/srt/models/llama4.py +32 -45
  104. sglang/srt/models/llama_eagle3.py +61 -11
  105. sglang/srt/models/llava.py +5 -5
  106. sglang/srt/models/minicpmo.py +2 -2
  107. sglang/srt/models/mistral.py +1 -1
  108. sglang/srt/models/mllama4.py +402 -89
  109. sglang/srt/models/phi4mm.py +1 -3
  110. sglang/srt/models/pixtral.py +3 -7
  111. sglang/srt/models/qwen2.py +31 -3
  112. sglang/srt/models/qwen2_5_vl.py +1 -3
  113. sglang/srt/models/qwen2_audio.py +200 -0
  114. sglang/srt/models/qwen2_moe.py +32 -6
  115. sglang/srt/models/qwen2_vl.py +1 -4
  116. sglang/srt/models/qwen3.py +94 -25
  117. sglang/srt/models/qwen3_moe.py +68 -21
  118. sglang/srt/models/vila.py +3 -8
  119. sglang/srt/{mm_utils.py → multimodal/mm_utils.py} +2 -2
  120. sglang/srt/{managers/multimodal_processors → multimodal/processors}/base_processor.py +140 -158
  121. sglang/srt/{managers/multimodal_processors → multimodal/processors}/clip.py +2 -13
  122. sglang/srt/{managers/multimodal_processors → multimodal/processors}/deepseek_vl_v2.py +4 -11
  123. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3.py +3 -10
  124. sglang/srt/{managers/multimodal_processors → multimodal/processors}/gemma3n.py +5 -20
  125. sglang/srt/{managers/multimodal_processors → multimodal/processors}/internvl.py +3 -10
  126. sglang/srt/{managers/multimodal_processors → multimodal/processors}/janus_pro.py +3 -9
  127. sglang/srt/{managers/multimodal_processors → multimodal/processors}/kimi_vl.py +6 -13
  128. sglang/srt/{managers/multimodal_processors → multimodal/processors}/llava.py +2 -10
  129. sglang/srt/{managers/multimodal_processors → multimodal/processors}/minicpm.py +5 -12
  130. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mlama.py +2 -14
  131. sglang/srt/{managers/multimodal_processors → multimodal/processors}/mllama4.py +65 -66
  132. sglang/srt/{managers/multimodal_processors → multimodal/processors}/phi4mm.py +4 -14
  133. sglang/srt/{managers/multimodal_processors → multimodal/processors}/pixtral.py +3 -9
  134. sglang/srt/{managers/multimodal_processors → multimodal/processors}/qwen_vl.py +8 -14
  135. sglang/srt/{managers/multimodal_processors → multimodal/processors}/vila.py +13 -31
  136. sglang/srt/operations_strategy.py +6 -2
  137. sglang/srt/reasoning_parser.py +26 -0
  138. sglang/srt/sampling/sampling_batch_info.py +39 -1
  139. sglang/srt/server_args.py +84 -22
  140. sglang/srt/speculative/build_eagle_tree.py +57 -18
  141. sglang/srt/speculative/eagle_worker.py +6 -4
  142. sglang/srt/two_batch_overlap.py +203 -27
  143. sglang/srt/utils.py +343 -163
  144. sglang/srt/warmup.py +12 -3
  145. sglang/test/runners.py +10 -1
  146. sglang/test/test_cutlass_w4a8_moe.py +281 -0
  147. sglang/test/test_utils.py +15 -3
  148. sglang/utils.py +5 -5
  149. sglang/version.py +1 -1
  150. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +12 -8
  151. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +157 -146
  152. sglang/math_utils.py +0 -8
  153. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek.py +0 -0
  154. /sglang/srt/{managers → eplb}/eplb_algorithms/deepseek_vec.py +0 -0
  155. /sglang/srt/{eplb_simulator → eplb/eplb_simulator}/__init__.py +0 -0
  156. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
  157. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
  158. {sglang-0.4.8.post1.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0
sglang/srt/lora/triton_ops/gate_up_lora_b.py
@@ -31,28 +31,44 @@ def _gate_up_lora_b_kernel(
  BLOCK_S: tl.constexpr,
  BLOCK_N: tl.constexpr,
  BLOCK_K: tl.constexpr,
- # For fused output scaling and adding
- fuse_scaling_add,
+ # For fused output scaling
  scalings,
  ):
- # This kernel packs 2 sgemms (gate/up) into a single kernel.
-
- # x: (s, 2 * K), s is the sum of sequence lengths, K equals to lora rank
- # weights: (num_lora, 2 * output_dim, K)
- # output: (s, 2 * output_dim)
+ """
+ This kernel packs 2 sgemms (gate/up) into a single kernel. The multiplication
+ results are accumulated into the output tensor.
+
+ When a sequence's rank is 0, the kernel is essentially a no-op, following
+ the convention in pytorch where the product of two matrices of shape (m, 0)
+ and (0, n) is an all-zero matrix of shape (m, n).
+
+ Args:
+ x (Tensor): The input tensor, which is the result of the LoRA A projection.
+ Shape: (s, 2 * K), where s is the sum of all sequence lengths in the
+ batch and K is the maximum LoRA rank.
+ weights (Tensor): The LoRA B weights for all adapters.
+ Shape: (num_lora, 2 * output_dim, K).
+ output (Tensor): The output tensor where the result is stored.
+ Shape: (s, 2 * output_dim).
+ """
  # output_dim >> K

  # Current block computes sequence with batch_id,
  # which starts from row seg_start of x with length seg_len.
  # gate_up_id decides which of gate or up (0: gate, 1: up)
  batch_id = tl.program_id(axis=2)
+ w_index = tl.load(weight_indices + batch_id)
+ rank = tl.load(lora_ranks + w_index)
+
+ # If rank is 0, this kernel is a no-op.
+ if rank == 0:
+ return
+
  gate_up_id = tl.program_id(axis=1)
  pid = tl.program_id(axis=0)
  seg_len = tl.load(seg_lens + batch_id)
- w_index = tl.load(weight_indices + batch_id)
  seg_start = tl.load(seg_indptr + batch_id)
  n_start = gate_up_id * output_dim # offset on output dim
- rank = tl.load(lora_ranks + w_index)
  scaling = tl.load(scalings + w_index)

  # Adjust K (rank) according to the specific LoRA adapter
@@ -82,14 +98,13 @@ def _gate_up_lora_b_kernel(
  for k in range(0, tl.cdiv(K, BLOCK_K)):
  x_tile = tl.load(
  x_ptrs,
- mask=(s_offset[:, None] < seg_len)
- and (k_offset[None, :] < K - k * BLOCK_K),
+ mask=(s_offset[:, None] < seg_len) & (k_offset[None, :] < K - k * BLOCK_K),
  other=0.0,
  )
  w_tile = tl.load(
  w_ptrs,
  mask=(k_offset[:, None] < K - k * BLOCK_K)
- and (n_offset[None, :] < output_dim),
+ & (n_offset[None, :] < output_dim),
  other=0.0,
  )
  partial_sum += tl.dot(x_tile, w_tile)
@@ -103,9 +118,8 @@ def _gate_up_lora_b_kernel(
  output_ptr = (output + seg_start * output_stride_0 + n_start * output_stride_1) + (
  s_offset[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
  )
- output_mask = (s_offset[:, None] < seg_len) and (n_offset[None, :] < output_dim)
- if fuse_scaling_add:
- partial_sum += tl.load(output_ptr, mask=output_mask)
+ output_mask = (s_offset[:, None] < seg_len) & (n_offset[None, :] < output_dim)
+ partial_sum += tl.load(output_ptr, mask=output_mask)
  tl.store(output_ptr, partial_sum, mask=output_mask)


@@ -143,11 +157,9 @@ def gate_up_lora_b_fwd(
  )

  if base_output is None:
- output = torch.empty((s, 2 * output_dim), device=x.device, dtype=x.dtype)
- fuse_scaling_add = False
+ output = torch.zeros((s, 2 * output_dim), device=x.device, dtype=x.dtype)
  else:
  output = base_output
- fuse_scaling_add = True

  _gate_up_lora_b_kernel[grid_b](
  x,
@@ -169,7 +181,6 @@ def gate_up_lora_b_fwd(
  BLOCK_S,
  BLOCK_OUT,
  BLOCK_R,
- fuse_scaling_add,
  batch_info.scalings,
  )

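The gate/up hunks above (and the matching q/k/v and sgemm ones below) drop the fuse_scaling_add flag: when no base_output is given, the output buffer is now created with torch.zeros, so the kernel can always accumulate its scaled partial sum into it. A minimal host-side sketch of that semantics in plain PyTorch, not the Triton kernel itself; tensor names are illustrative:

import torch

def lora_b_accumulate(x, lora_b_weight, scaling, base_output=None):
    # x: (s, K) LoRA-A activations; lora_b_weight: (N, K); returns (s, N).
    # The buffer starts at zero (or at the base model output), and the scaled
    # LoRA product is always added in, so no fuse_scaling_add flag is needed.
    s, n = x.shape[0], lora_b_weight.shape[0]
    output = (
        torch.zeros((s, n), device=x.device, dtype=x.dtype)
        if base_output is None
        else base_output
    )
    output += scaling * (x @ lora_b_weight.T)
    return output
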
sglang/srt/lora/triton_ops/qkv_lora_b.py
@@ -33,29 +33,45 @@ def _qkv_lora_b_kernel(
  BLOCK_S: tl.constexpr,
  BLOCK_N: tl.constexpr,
  BLOCK_K: tl.constexpr,
- # For fused output scaling and adding
- fuse_scaling_add,
+ # For fused output scaling
  scalings,
  ):
- # This kernel packs 3 sgemms (q/k/v) into a single kernel.
-
- # x: (s, 3 * K), s is the sum of sequence lengths, K equals to lora rank
- # weights: (num_lora, N_Q + 2 * N_KV, K)
- # output: (s, N_Q + 2 * N_KV)
- # N_Q >> K, N_KV >> K
+ """
+ This kernel packs 3 sgemms (q/k/v) into a single kernel. The multiplication
+ results are accumulated into the output tensor.
+
+ When a sequence's rank is 0, the kernel is essentially a no-op, following
+ the convention in pytorch where the product of two matrices of shape (m, 0)
+ and (0, n) is an all-zero matrix of shape (m, n).
+
+ Args:
+ x (Tensor): The input tensor, which is the result of the LoRA A projection.
+ Shape: (s, 3 * K), where s is the sum of all sequence lengths in the
+ batch and K is the maximum LoRA rank. The second dimension is partitioned
+ for Q, K, and V.
+ weights (Tensor): The LoRA B weights for all adapters.
+ Shape: (num_lora, N_Q + 2 * N_KV, K).
+ output (Tensor): The output tensor where the result is stored.
+ Shape: (s, N_Q + 2 * N_KV).
+ """

  # Current block computes sequence with batch_id,
  # which starts from row seg_start of x with length seg_len.
  # qkv_id decides which of q,k,v to compute (0: q, 1: k, 2: v)
  batch_id = tl.program_id(axis=2)
+ w_index = tl.load(weight_indices + batch_id)
+ rank = tl.load(lora_ranks + w_index)
+
+ # If rank is 0, this kernel is a no-op.
+ if rank == 0:
+ return
+
  qkv_id = tl.program_id(axis=1)
  pid = tl.program_id(axis=0)
  seg_len = tl.load(seg_lens + batch_id)
- w_index = tl.load(weight_indices + batch_id)
  seg_start = tl.load(seg_indptr + batch_id)
  n_start = tl.load(n_offs + qkv_id)
  n_size = tl.load(n_offs + qkv_id + 1) - n_start
- rank = tl.load(lora_ranks + w_index)
  scaling = tl.load(scalings + w_index)
  # Adjust K (rank) according to the specific LoRA adapter
  K = tl.minimum(K, rank)
@@ -84,13 +100,12 @@ def _qkv_lora_b_kernel(
  for k in range(0, tl.cdiv(K, BLOCK_K)):
  x_tile = tl.load(
  x_ptrs,
- mask=(s_offset[:, None] < seg_len)
- and (k_offset[None, :] < K - k * BLOCK_K),
+ mask=(s_offset[:, None] < seg_len) & (k_offset[None, :] < K - k * BLOCK_K),
  other=0.0,
  )
  w_tile = tl.load(
  w_ptrs,
- mask=(k_offset[:, None] < K - k * BLOCK_K) and (n_offset[None, :] < n_size),
+ mask=(k_offset[:, None] < K - k * BLOCK_K) & (n_offset[None, :] < n_size),
  other=0.0,
  )
  partial_sum += tl.dot(x_tile, w_tile)
@@ -105,8 +120,7 @@ def _qkv_lora_b_kernel(
  s_offset[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
  )
  output_mask = (s_offset[:, None] < seg_len) and (n_offset[None, :] < n_size)
- if fuse_scaling_add:
- partial_sum += tl.load(output_ptr, mask=output_mask)
+ partial_sum += tl.load(output_ptr, mask=output_mask)
  tl.store(output_ptr, partial_sum, mask=output_mask)


@@ -153,11 +167,9 @@ def qkv_lora_b_fwd(
  )

  if base_output is None:
- output = torch.empty((s, output_dim), device=x.device, dtype=x.dtype)
- fuse_scaling_add = False
+ output = torch.zeros((s, output_dim), device=x.device, dtype=x.dtype)
  else:
  output = base_output
- fuse_scaling_add = True

  _qkv_lora_b_kernel[grid_b](
  x,
@@ -180,7 +192,6 @@ def qkv_lora_b_fwd(
  BLOCK_S,
  BLOCK_OUT,
  BLOCK_R,
- fuse_scaling_add,
  batch_info.scalings,
  )

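As the new docstring notes, the packed output's second dimension is laid out as N_Q columns for Q followed by N_KV columns each for K and V (the kernel reads the actual boundaries from n_offs). A small sketch of how such a packed tensor would be sliced apart; the helper name and arguments are hypothetical:

def split_packed_qkv(output, n_q: int, n_kv: int):
    # output: (s, n_q + 2 * n_kv), packed as [Q | K | V] along dim 1
    q = output[:, :n_q]
    k = output[:, n_q:n_q + n_kv]
    v = output[:, n_q + n_kv:n_q + 2 * n_kv]
    return q, k, v
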
sglang/srt/lora/triton_ops/sgemm_lora_a.py
@@ -33,19 +33,36 @@ def _sgemm_lora_a_kernel(
  BLOCK_N: tl.constexpr,
  BLOCK_K: tl.constexpr,
  ):
-
- # x: (s, K), s is the sum of sequence lengths
- # weights: (num_lora, N, K)
- # output: (s, N)
+ """
+ Computes a segmented batched matrix multiplication for the LoRA A matrix.
+
+ The kernel ensures that output[seg_start:seg_start + seg_len, :rank * stack_num]
+ stores the product of the input `x` and the LoRA weights for the corresponding
+ sequence. This implies that when rank is 0, the kernel is essentially a no-op,
+ as output[seg_start:seg_start + seg_len, :0] is trivially correct (empty).
+
+ Args:
+ x (torch.Tensor): The input activations tensor of shape `(s, K)`, where `s`
+ is the sum of all sequence lengths in the batch.
+ weights (torch.Tensor): The LoRA 'A' weights for all available adapters,
+ with shape `(num_lora, N, K)`.
+ output (torch.Tensor): The output tensor of shape `(s, N)`.
+ """

  # Current block computes sequence with batch_id,
  # which starts from row seg_start of x with length seg_len
  batch_id = tl.program_id(axis=1)
- pid = tl.program_id(axis=0)
- seg_len = tl.load(seg_lens + batch_id)
  w_index = tl.load(weight_indices + batch_id)
- seg_start = tl.load(seg_indptr + batch_id)
  rank = tl.load(lora_ranks + w_index)
+
+ # If rank is 0, this kernel becomes a no-op as the output is always trivially correct.
+ if rank == 0:
+ return
+
+ pid = tl.program_id(axis=0)
+ seg_start = tl.load(seg_indptr + batch_id)
+ seg_len = tl.load(seg_lens + batch_id)
+
  # Adjust N (stack_num * max_rank) according to the specific LoRA adapter
  N = tl.minimum(N, rank * stack_num)

@@ -72,13 +89,12 @@ def _sgemm_lora_a_kernel(
  for k in range(0, tl.cdiv(K, BLOCK_K)):
  x_tile = tl.load(
  x_ptrs,
- mask=(s_offset[:, None] < seg_len)
- and (k_offset[None, :] < K - k * BLOCK_K),
+ mask=(s_offset[:, None] < seg_len) & (k_offset[None, :] < K - k * BLOCK_K),
  other=0.0,
  )
  w_tile = tl.load(
  w_ptrs,
- mask=(k_offset[:, None] < K - k * BLOCK_K) and (n_offset[None, :] < N),
+ mask=(k_offset[:, None] < K - k * BLOCK_K) & (n_offset[None, :] < N),
  other=0.0,
  )
  partial_sum += tl.dot(x_tile, w_tile)
@@ -91,7 +107,7 @@ def _sgemm_lora_a_kernel(
  output_ptr = (output + seg_start * output_stride_0) + (
  s_offset[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
  )
- output_mask = (s_offset[:, None] < seg_len) and (n_offset[None, :] < N)
+ output_mask = (s_offset[:, None] < seg_len) & (n_offset[None, :] < N)
  tl.store(output_ptr, partial_sum, mask=output_mask)

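The early return added above relies on the PyTorch convention the docstrings cite: a matmul whose inner dimension is zero produces an all-zero matrix, so a rank-0 adapter contributes nothing and the pre-initialized output is already correct. A two-line check of that convention:

import torch

# (m, 0) @ (0, n) yields an all-zero (m, n) matrix, so a rank-0 LoRA slot
# can be skipped entirely without changing the result.
m, n = 4, 8
assert torch.equal(torch.empty(m, 0) @ torch.empty(0, n), torch.zeros(m, n))
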
sglang/srt/lora/triton_ops/sgemm_lora_b.py
@@ -31,22 +31,39 @@ def _sgemm_lora_b_kernel(
  BLOCK_S: tl.constexpr,
  BLOCK_N: tl.constexpr,
  BLOCK_K: tl.constexpr,
- # For fused output scaling and adding
- fuse_scaling_add,
+ # For fused output scaling
  scalings,
  ):
- # x: (s, K), s is the sum of sequence lengths
- # weights: (num_lora, N, K)
- # output: (s, N)
+ """
+ Computes a segmented batched matrix multiplication for the LoRA B matrix
+ and adds the result to the output in-place.
+
+ When a sequence's rank is 0, the kernel is essentially a no-op, following
+ the convention in pytorch where the product of two matrices of shape (m, 0)
+ and (0, n) is an all-zero matrix of shape (m, n).
+
+ Args:
+ x (torch.Tensor): The intermediate tensor from the LoRA 'A' multiplication,
+ of shape `(s, K)`, where `s` is the total number of tokens.
+ weights (torch.Tensor): The LoRA 'B' weights for all available adapters,
+ with shape `(num_lora, N, K)`.
+ output (torch.Tensor): The output tensor of shape `(s, N)`. This can be
+ the base model's output for a fused add operation.
+ """

  # Current block computes sequence with batch_id,
  # which starts from row seg_start of x with length seg_len
  batch_id = tl.program_id(axis=1)
+ w_index = tl.load(weight_indices + batch_id)
+ rank = tl.load(lora_ranks + w_index)
+
+ # If rank is 0, this kernel is a no-op.
+ if rank == 0:
+ return
+
  pid = tl.program_id(axis=0)
  seg_len = tl.load(seg_lens + batch_id)
- w_index = tl.load(weight_indices + batch_id)
  seg_start = tl.load(seg_indptr + batch_id)
- rank = tl.load(lora_ranks + w_index)
  scaling = tl.load(scalings + w_index)
  # Adjust K (rank) according to the specific LoRA adapter
  K = tl.minimum(K, rank)
@@ -74,8 +91,7 @@ def _sgemm_lora_b_kernel(
  for k in range(0, tl.cdiv(K, BLOCK_K)):
  x_tile = tl.load(
  x_ptrs,
- mask=(s_offset[:, None] < seg_len)
- and (k_offset[None, :] < K - k * BLOCK_K),
+ mask=(s_offset[:, None] < seg_len) & (k_offset[None, :] < K - k * BLOCK_K),
  other=0.0,
  )
  w_tile = tl.load(
@@ -95,8 +111,7 @@ def _sgemm_lora_b_kernel(
  s_offset[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
  )
  output_mask = s_offset[:, None] < seg_len
- if fuse_scaling_add:
- partial_sum += tl.load(output_ptr, mask=output_mask)
+ partial_sum += tl.load(output_ptr, mask=output_mask)
  tl.store(output_ptr, partial_sum, mask=output_mask)


@@ -132,11 +147,9 @@ def sgemm_lora_b_fwd(
  )

  if base_output is None:
- output = torch.empty((S, N), device=x.device, dtype=x.dtype)
- fuse_scaling_add = False
+ output = torch.zeros((S, N), device=x.device, dtype=x.dtype)
  else:
  output = base_output
- fuse_scaling_add = True

  _sgemm_lora_b_kernel[grid](
  x,
@@ -158,7 +171,6 @@ def sgemm_lora_b_fwd(
  BLOCK_S,
  BLOCK_N,
  BLOCK_R,
- fuse_scaling_add,
  batch_info.scalings,
  )
  return output
sglang/srt/managers/cache_controller.py
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """

- import concurrent.futures
  import logging
  import math
  import threading
@@ -169,12 +168,23 @@ class HiCacheController:
  page_size: int,
  load_cache_event: threading.Event = None,
  write_policy: str = "write_through_selective",
+ io_backend: str = "",
  ):
  self.mem_pool_device_allocator = token_to_kv_pool_allocator
  self.mem_pool_device = token_to_kv_pool_allocator.get_kvcache()
  self.mem_pool_host = mem_pool_host
  self.write_policy = write_policy
  self.page_size = page_size
+ # using kernel for small page KV cache transfer and DMA for large pages
+ if not io_backend:
+ IO_BACKEND_PAGE_SIZE_THRESHOLD = 64
+ self.io_backend = (
+ "direct"
+ if self.page_size >= IO_BACKEND_PAGE_SIZE_THRESHOLD
+ else "kernel"
+ )
+ else:
+ self.io_backend = io_backend

  self.load_cache_event = load_cache_event
  self.layer_done_counter = LayerDoneCounter(self.mem_pool_device.layer_num)
@@ -203,12 +213,7 @@ class HiCacheController:
  self.load_stream = torch.cuda.Stream()

  self.write_thread = threading.Thread(
- target=(
- self.write_thread_func_buffer
- if self.page_size == 1
- else self.write_thread_func_direct
- ),
- daemon=True,
+ target=self.write_thread_func_direct, daemon=True
  )
  self.load_thread = threading.Thread(
  target=self.load_thread_func_layer_by_layer, daemon=True
@@ -229,12 +234,7 @@ class HiCacheController:
  self.ack_load_queue.queue.clear()

  self.write_thread = threading.Thread(
- target=(
- self.write_thread_func_buffer
- if self.page_size == 1
- else self.write_thread_func_direct
- ),
- daemon=True,
+ target=self.write_thread_func_direct, daemon=True
  )
  self.load_thread = threading.Thread(
  target=self.load_thread_func_layer_by_layer, daemon=True
@@ -281,6 +281,15 @@ class HiCacheController:
  )
  return device_indices

+ def move_indices(self, host_indices, device_indices):
+ # move indices to GPU if using kernels, to host if using direct indexing
+ if self.io_backend == "kernel":
+ return host_indices.to(self.mem_pool_device.device), device_indices
+ elif self.io_backend == "direct":
+ return host_indices, device_indices.cpu()
+ else:
+ raise ValueError(f"Unsupported io backend")
+
  def write_thread_func_direct(self):
  """
  Directly write through KV caches to host memory without buffering.
@@ -289,10 +298,14 @@ class HiCacheController:
  while not self.stop_event.is_set():
  try:
  operation = self.write_queue.get(block=True, timeout=1)
- self.mem_pool_host.write_page_all_layers(
- operation.host_indices,
- operation.device_indices,
- self.mem_pool_device,
+ host_indices, device_indices = self.move_indices(
+ operation.host_indices, operation.device_indices
+ )
+ self.mem_pool_device.backup_to_host_all_layer(
+ self.mem_pool_host,
+ host_indices,
+ device_indices,
+ self.io_backend,
  )
  self.write_stream.synchronize()
  self.mem_pool_host.complete_io(operation.host_indices)
@@ -304,27 +317,6 @@ class HiCacheController:
  except Exception as e:
  logger.error(e)

- def load_thread_func_direct(self):
- """
- Directly load KV caches from host memory to device memory without buffering.
- """
- torch.cuda.set_stream(self.load_stream)
- while not self.stop_event.is_set():
- try:
- operation = self.load_queue.get(block=True, timeout=1)
- operation.data = self.mem_pool_host.get_flat_data(
- operation.host_indices
- )
- self.mem_pool_device.transfer(operation.device_indices, operation.data)
- self.mem_pool_host.complete_io(operation.host_indices)
- for node_id in operation.node_ids:
- if node_id != 0:
- self.ack_load_queue.put(node_id)
- except Empty:
- continue
- except Exception as e:
- logger.error(e)
-
  def load_thread_func_layer_by_layer(self):
  """
  Load KV caches from host memory to device memory layer by layer.
@@ -349,22 +341,18 @@ class HiCacheController:

  # start layer-wise KV cache transfer from CPU to GPU
  self.layer_done_counter.reset()
+ host_indices, device_indices = self.move_indices(
+ batch_operation.host_indices, batch_operation.device_indices
+ )
  for i in range(self.mem_pool_host.layer_num):
- if self.page_size == 1:
- flat_data = self.mem_pool_host.get_flat_data_by_layer(
- batch_operation.host_indices, i
- )
- self.mem_pool_device.transfer_per_layer(
- batch_operation.device_indices, flat_data, i
- )
- else:
- self.mem_pool_host.load_page_per_layer(
- batch_operation.host_indices,
- batch_operation.device_indices,
- self.mem_pool_device,
- i,
- )
- self.load_stream.synchronize()
+ self.mem_pool_device.load_from_host_per_layer(
+ self.mem_pool_host,
+ host_indices,
+ device_indices,
+ i,
+ self.io_backend,
+ )
+ self.load_stream.synchronize()
  self.layer_done_counter.increment()

  self.mem_pool_host.complete_io(batch_operation.host_indices)
@@ -372,148 +360,6 @@ class HiCacheController:
  if node_id != 0:
  self.ack_load_queue.put(node_id)

- def write_aux_func(self, no_wait=False):
- """
- Auxiliary function to prepare the buffer for write operations.
- """
- torch.cuda.set_stream(self.write_stream)
-
- def _to_op(op_):
- assert op_.device_indices.is_cuda, "Device indices should be on GPU"
- op_.data = self.mem_pool_device.get_flat_data(op_.device_indices).to(
- self.mem_pool_host.device
- )
- self.write_buffer.put(op_)
- return op_
-
- buffer = None
- while not self.stop_event.is_set():
- try:
- operation = self.write_queue.get(block=True, timeout=1)
- factor = (
- len(operation.device_indices) // self.write_buffer.max_buffer_size
- )
-
- if factor >= 1:
- if buffer is not None:
- _to_op(buffer)
- buffer = None
-
- if factor < 2:
- _to_op(operation)
- else:
- split_ops = operation.split(factor)
- for op_ in split_ops:
- _to_op(op_)
- continue
-
- if buffer is None:
- buffer = operation
- else:
- buffer.merge(operation)
- if (
- no_wait
- or len(buffer.host_indices) >= self.write_buffer.max_buffer_size
- or self.write_queue.empty()
- or self.write_buffer.empty()
- ):
- _to_op(buffer)
- buffer = None
- except Empty:
- continue
- except Exception as e:
- logger.error(e)
-
- def load_aux_func(self):
- """
- Auxiliary function to prepare the buffer for load operations.
- """
-
- def _pin_op(op_, put=True):
- op_.data = (
- self.mem_pool_host.get_flat_data(op_.host_indices)
- .contiguous()
- .pin_memory()
- )
- if put:
- self.load_buffer.put(op_)
- return op_
-
- buffer = None
- while not self.stop_event.is_set():
- try:
- operation = self.load_queue.get(block=True, timeout=1)
- factor = len(operation.host_indices) // self.load_buffer.max_buffer_size
-
- if factor >= 1:
- if buffer is not None:
- _pin_op(buffer)
- buffer = None
-
- if factor < 2:
- _pin_op(operation)
- else:
- split_ops = operation.split(factor)
- split_args = [(op_, True) for op_ in split_ops[:-1]]
- split_args.append((split_ops[-1], False))
- # Spawn threads to pin each op concurrently
- with concurrent.futures.ThreadPoolExecutor() as executor:
- pinned_ops = list(
- executor.map(
- lambda x: _pin_op(x[0], put=x[1]), split_args
- )
- )
- # preserve the order of last op to ensure correct ack
- self.load_buffer.put(pinned_ops[-1])
- continue
-
- if buffer is None:
- buffer = operation
- else:
- buffer.merge(operation)
- if (
- len(buffer.host_indices) >= self.load_buffer.max_buffer_size
- or self.load_queue.empty()
- or self.load_buffer.empty()
- ):
- _pin_op(buffer)
- buffer = None
- except Empty:
- continue
- except Exception as e:
- logger.error(e)
-
- # todo (zhiqiang): double buffering to be deprecated
- def write_thread_func_buffer(self):
- aux_thread = threading.Thread(target=self.write_aux_func, daemon=True)
- aux_thread.start()
-
- while not self.stop_event.is_set():
- operation = self.write_buffer.get()
- if operation is None:
- continue
- self.mem_pool_host.assign_flat_data(operation.host_indices, operation.data)
- self.mem_pool_host.complete_io(operation.host_indices)
- for node_id in operation.node_ids:
- if node_id != 0:
- self.ack_write_queue.put(node_id)
- aux_thread.join()
-
- def load_thread_func_buffer(self):
- torch.cuda.set_stream(self.load_stream)
- aux_thread = threading.Thread(target=self.load_aux_func, daemon=True)
- aux_thread.start()
- while not self.stop_event.is_set():
- operation = self.load_buffer.get()
- if operation is None:
- continue
- self.mem_pool_device.transfer(operation.device_indices, operation.data)
- self.mem_pool_host.complete_io(operation.host_indices)
- for node_id in operation.node_ids:
- if node_id != 0:
- self.ack_load_queue.put(node_id)
- aux_thread.join()
-
  def evict_device(
  self, device_indices: torch.Tensor, host_indices: torch.Tensor
  ) -> int:
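The io_backend plumbing above lets HiCacheController pick between a gather/scatter kernel for small pages and direct indexing for large pages, using 64 tokens per page as the cutoff when no backend is specified. A condensed restatement of that selection logic as a standalone helper; the function itself is illustrative, not part of the module:

IO_BACKEND_PAGE_SIZE_THRESHOLD = 64  # pages at or above this size use "direct"

def choose_io_backend(page_size: int, io_backend: str = "") -> str:
    # Mirrors the constructor logic: an explicit io_backend wins; otherwise
    # large pages use direct (DMA-style) copies and small pages use the kernel.
    if io_backend:
        return io_backend
    return "direct" if page_size >= IO_BACKEND_PAGE_SIZE_THRESHOLD else "kernel"

The chosen backend is then passed to backup_to_host_all_layer and load_from_host_per_layer, with move_indices placing the index tensors on the device each backend expects.
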
sglang/srt/managers/configure_logging.py
@@ -28,7 +28,7 @@ if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument("--url", type=str, default="http://localhost:30000")
  parser.add_argument("--log-requests", action="store_true")
- parser.add_argument("--log-requests-level", type=int, default=2)
+ parser.add_argument("--log-requests-level", type=int, default=3)
  parser.add_argument(
  "--dump-requests-folder", type=str, default="/tmp/sglang_request_dump"
  )