sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +113 -17
- sglang/srt/configs/model_config.py +35 -0
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +6 -1
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +243 -135
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +11 -9
- sglang/srt/entrypoints/context.py +244 -0
- sglang/srt/entrypoints/engine.py +4 -3
- sglang/srt/entrypoints/harmony_utils.py +370 -0
- sglang/srt/entrypoints/http_server.py +71 -0
- sglang/srt/entrypoints/openai/protocol.py +227 -1
- sglang/srt/entrypoints/openai/serving_chat.py +278 -42
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +174 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/harmony_tool_parser.py +130 -0
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +8 -1
- sglang/srt/layers/attention/aiter_backend.py +5 -8
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/vision.py +13 -5
- sglang/srt/layers/communicator.py +21 -4
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/linear.py +2 -7
- sglang/srt/layers/moe/cutlass_moe.py +20 -6
- sglang/srt/layers/moe/ep_moe/layer.py +77 -73
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +416 -35
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/topk.py +12 -3
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +22 -0
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_utils.py +29 -0
- sglang/srt/layers/quantization/modelopt_quant.py +259 -64
- sglang/srt/layers/quantization/mxfp4.py +651 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/__init__.py +0 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +225 -1
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/lora_manager.py +70 -14
- sglang/srt/lora/lora_registry.py +3 -2
- sglang/srt/lora/mem_pool.py +43 -5
- sglang/srt/managers/cache_controller.py +55 -30
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +15 -3
- sglang/srt/managers/mm_utils.py +5 -11
- sglang/srt/managers/schedule_batch.py +28 -7
- sglang/srt/managers/scheduler.py +26 -12
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +24 -6
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/hiradix_cache.py +53 -5
- sglang/srt/mem_cache/memory_pool_host.py +1 -1
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +7 -6
- sglang/srt/model_executor/forward_batch_info.py +35 -14
- sglang/srt/model_executor/model_runner.py +19 -2
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +72 -33
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma3n_mm.py +39 -0
- sglang/srt/models/glm4_moe.py +24 -12
- sglang/srt/models/gpt_oss.py +1134 -0
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +18 -39
- sglang/srt/server_args.py +142 -7
- sglang/srt/two_batch_overlap.py +157 -5
- sglang/srt/utils.py +38 -2
- sglang/test/runners.py +2 -2
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +16 -14
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +105 -84
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
@@ -556,7 +556,7 @@ class TokenizerManager:
         if self.server_args.enable_lora and obj.lora_path:
             # Start tracking ongoing requests for LoRA adapters and replace the user-friendly LoRA names in
             # `lora_path` with their corresponding unique LoRA IDs, as required for internal processing.
-            obj.
+            obj.lora_id = await self.lora_registry.acquire(obj.lora_path)

         self._validate_one_request(obj, input_ids)
         return self._create_tokenized_object(
@@ -665,7 +665,7 @@ class TokenizerManager:
             bootstrap_host=obj.bootstrap_host,
             bootstrap_port=obj.bootstrap_port,
             bootstrap_room=obj.bootstrap_room,
-
+            lora_id=obj.lora_id,
             input_embeds=input_embeds,
             session_params=session_params,
             custom_logit_processor=obj.custom_logit_processor,
@@ -750,7 +750,11 @@ class TokenizerManager:
             try:
                 await asyncio.wait_for(state.event.wait(), timeout=4)
             except asyncio.TimeoutError:
-                if
+                if (
+                    request is not None
+                    and not obj.background
+                    and await request.is_disconnected()
+                ):
                     # Abort the request for disconnected requests (non-streaming, waiting queue)
                     self.abort_request(obj.rid)
                     # Use exception to kill the whole call stack and asyncio task
@@ -773,7 +777,7 @@ class TokenizerManager:

         # Mark ongoing LoRA request as finished.
         if self.server_args.enable_lora and obj.lora_path:
-            await self.lora_registry.release(obj.
+            await self.lora_registry.release(obj.lora_id)

         # Check if this was an abort/error created by scheduler
         if isinstance(out["meta_info"].get("finish_reason"), dict):
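The acquire/release pair above amounts to per-adapter reference counting in the LoRA registry: a request pins its adapter on entry and unpins it on completion, so an adapter is only unloadable when its count reaches zero. A minimal sketch of that bookkeeping, with hypothetical names (`TinyLoRARegistry`, `_inflight`); the real `LoRARegistry` API may differ:

```python
import asyncio
import uuid


class TinyLoRARegistry:
    """Illustrative only: maps user-facing adapter names to stable IDs and
    counts in-flight requests per adapter, mirroring acquire()/release()."""

    def __init__(self):
        self._name_to_id: dict[str, str] = {}
        self._inflight: dict[str, int] = {}
        self._lock = asyncio.Lock()

    def register(self, lora_name: str) -> str:
        lora_id = self._name_to_id.setdefault(lora_name, uuid.uuid4().hex)
        self._inflight.setdefault(lora_id, 0)
        return lora_id

    async def acquire(self, lora_name: str) -> str:
        # Called when a request enters: pin the adapter and hand back its ID.
        async with self._lock:
            lora_id = self._name_to_id[lora_name]
            self._inflight[lora_id] += 1
            return lora_id

    async def release(self, lora_id: str) -> None:
        # Called when the request finishes: an adapter is safe to unload at 0.
        async with self._lock:
            self._inflight[lora_id] -= 1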
@@ -805,7 +809,11 @@ class TokenizerManager:
                 if obj.stream:
                     yield out
                 else:
-                    if
+                    if (
+                        request is not None
+                        and not obj.background
+                        and await request.is_disconnected()
+                    ):
                         # Abort the request for disconnected requests (non-streaming, running)
                         self.abort_request(obj.rid)
                         # Use exception to kill the whole call stack and asyncio task
@@ -1121,6 +1129,7 @@ class TokenizerManager:
         new_adapter = LoRARef(
             lora_name=obj.lora_name,
             lora_path=obj.lora_path,
+            pinned=obj.pinned,
         )

         # Trigger the actual loading operation at the backend processes.
@@ -1178,7 +1187,7 @@ class TokenizerManager:

             return result
         except ValueError as e:
-            return UnloadLoRAAdapterReqOutput(success=False,
+            return UnloadLoRAAdapterReqOutput(success=False, error_message=str(e))

     async def get_weights_by_name(
         self, obj: GetWeightsByNameReqInput, request: Optional[fastapi.Request] = None
@@ -1548,8 +1557,17 @@ class TokenizerManager:

         if isinstance(recv_obj, BatchStrOut):
             state.text += recv_obj.output_strs[i]
+            if state.obj.stream:
+                state.output_ids.extend(recv_obj.output_ids[i])
+                output_token_ids = state.output_ids[state.last_output_offset :]
+                state.last_output_offset = len(state.output_ids)
+            else:
+                state.output_ids.extend(recv_obj.output_ids[i])
+                output_token_ids = state.output_ids.copy()
+
             out_dict = {
                 "text": state.text,
+                "output_ids": output_token_ids,
                 "meta_info": meta_info,
             }
         elif isinstance(recv_obj, BatchTokenIDOut):
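The new `output_ids` bookkeeping returns only the tokens generated since the previous chunk when streaming, and the full accumulated list otherwise. The same offset logic, condensed into a standalone sketch with hypothetical names:

```python
def next_output_ids(all_ids: list[int], new_ids: list[int], last_offset: int, stream: bool):
    """Return (ids_for_this_chunk, new_offset); illustrative, not the sglang API."""
    all_ids.extend(new_ids)
    if stream:
        chunk = all_ids[last_offset:]       # only tokens produced since the last chunk
        return chunk, len(all_ids)
    return all_ids.copy(), last_offset      # non-streaming: the full list every time


ids: list[int] = []
chunk1, off = next_output_ids(ids, [1, 2, 3], 0, stream=True)   # -> [1, 2, 3]
chunk2, off = next_output_ids(ids, [4, 5], off, stream=True)    # -> [4, 5] only
```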
sglang/srt/managers/tp_worker.py
CHANGED
@@ -311,3 +311,6 @@ class TpModelWorker:
     def unload_lora_adapter(self, recv_req: UnloadLoRAAdapterReqInput):
         result = self.model_runner.unload_lora_adapter(recv_req.to_ref())
         return result
+
+    def can_run_lora_batch(self, lora_ids: list[str]) -> bool:
+        return self.model_runner.lora_manager.validate_lora_batch(lora_ids)
sglang/srt/managers/tp_worker_overlap_thread.py
CHANGED
@@ -288,6 +288,9 @@ class TpModelWorkerClient:
     def unload_lora_adapter(self, recv_req: UnloadLoRAAdapterReqInput):
         return self.worker.unload_lora_adapter(recv_req)

+    def can_run_lora_batch(self, lora_ids: list[str]) -> bool:
+        return self.worker.can_run_lora_batch(lora_ids)
+
     def __delete__(self):
         self.input_queue.put((None, None))
         self.copy_queue.put((None, None, None))
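The new `can_run_lora_batch` hook lets the scheduler ask whether a candidate batch's adapters can be served together before it schedules them. The exact check lives in `LoRAManager.validate_lora_batch`; a hedged sketch of what such a check could look like, assuming a hypothetical per-batch adapter limit and a set of currently loaded adapter IDs:

```python
def validate_lora_batch_sketch(
    lora_ids: list, loaded_ids: set, max_loras_per_batch: int
) -> bool:
    """Illustrative only: a batch is runnable if its distinct adapters are all
    loaded and fit into the per-batch adapter slots. `None` means 'no LoRA'."""
    distinct = {i for i in lora_ids if i is not None}
    return distinct <= loaded_ids and len(distinct) <= max_loras_per_batch


# Two requests share one adapter, one request uses the base model.
print(validate_lora_batch_sketch(["a", "a", None], {"a", "b"}, max_loras_per_batch=1))  # True
```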
sglang/srt/mem_cache/hiradix_cache.py
CHANGED
@@ -2,11 +2,12 @@ import heapq
 import logging
 import threading
 import time
+from queue import Queue
 from typing import List, Optional

 import torch

-from sglang.srt.managers.cache_controller import HiCacheController
+from sglang.srt.managers.cache_controller import HiCacheController, PrefetchOperation
 from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
 from sglang.srt.mem_cache.base_prefix_cache import MatchResult
 from sglang.srt.mem_cache.memory_pool import (
@@ -37,6 +38,7 @@ class HiRadixCache(RadixCache):
         hicache_io_backend: str,
         hicache_mem_layout: str,
         hicache_storage_backend: Optional[str] = None,
+        hicache_storage_prefetch_policy: Optional[str] = "best_effort",
     ):

         if hicache_io_backend == "direct":
@@ -85,6 +87,13 @@ class HiRadixCache(RadixCache):
             prefetch_threshold=self.prefetch_threshold,
         )

+        self.prefetch_stop_policy = hicache_storage_prefetch_policy
+        # todo: customizable storage prefetch timeout
+        self.prefetch_timeout = 3  # seconds
+        logger.info(
+            f"HiCache storage prefetch policy: {hicache_storage_prefetch_policy}"
+        )
+
         # record the nodes with ongoing write through
         self.ongoing_write_through = {}
         # record the node segments with ongoing load back
@@ -385,9 +394,10 @@ class HiRadixCache(RadixCache):
         for _ in range(queue_size.item()):
             req_id = self.cache_controller.prefetch_revoke_queue.get()
             if req_id in self.ongoing_prefetch:
-                last_host_node,
+                last_host_node, token_ids, _, _ = self.ongoing_prefetch[req_id]
                 last_host_node.release_host()
                 del self.ongoing_prefetch[req_id]
+                self.cache_controller.prefetch_tokens_occupied -= len(token_ids)
             else:
                 # the revoked operation already got terminated
                 pass
@@ -419,10 +429,41 @@ class HiRadixCache(RadixCache):
         host_node.release_host()
         del self.ongoing_backup[ack_id]

-    def
+    def can_terminate_prefetch(self, operation: PrefetchOperation):
+        can_terminate = True
+
+        if self.prefetch_stop_policy == "best_effort":
+            return can_terminate
+
+        completed = (
+            operation.completed_tokens == len(operation.hash_value) * self.page_size
+        )
+
+        if self.prefetch_stop_policy == "wait_complete":
+            can_terminate = completed
+        elif self.prefetch_stop_policy == "timeout":
+            can_terminate = completed or (
+                time.monotonic() - operation.start_time > self.prefetch_timeout
+            )
+        else:
+            # unknown prefetch stop policy, just return True
+            return True
+
+        if self.tp_world_size > 1:
+            can_terminate = torch.tensor(can_terminate, dtype=torch.int)
+            torch.distributed.all_reduce(
+                can_terminate,
+                op=torch.distributed.ReduceOp.MIN,
+                group=self.tp_group,
+            )
+            can_terminate = bool(can_terminate.item())
+
+        return can_terminate
+
+    def check_prefetch_progress(self, req_id: str) -> bool:
         if req_id not in self.ongoing_prefetch:
             # there is no ongoing prefetch for this request or it has been revoked
-            return
+            return True

         # todo: more policies for prefetch progress such as timeout
         # the current policy is to prefetch with best effort and terminate when queuing is over
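For the `timeout` and `wait_complete` policies, all tensor-parallel ranks must agree before a prefetch is terminated; the all-reduce with `MIN` above means the group proceeds only when every rank can terminate. A standalone sketch of that consensus step, assuming an already initialized `torch.distributed` process group:

```python
import torch
import torch.distributed as dist


def group_can_terminate(local_decision: bool, group=None) -> bool:
    """Every rank proposes a local yes/no; MIN over the group means
    'terminate only if all ranks say yes'. Illustrative sketch."""
    flag = torch.tensor(int(local_decision), dtype=torch.int)
    dist.all_reduce(flag, op=dist.ReduceOp.MIN, group=group)
    return bool(flag.item())
```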
@@ -430,13 +471,16 @@ class HiRadixCache(RadixCache):
             req_id
         ]

+        if not self.can_terminate_prefetch(operation):
+            return False
+
         completed_tokens, hash_value = self.cache_controller.terminate_prefetch(
             operation
         )
         logger.debug(f"Prefetch {req_id} completed with {completed_tokens} tokens")

         min_completed_tokens = completed_tokens
-        if self.tp_world_size > 1:
+        if self.tp_world_size > 1 and self.prefetch_stop_policy != "wait_complete":
             # synchrnoize TP workers to make the same update to hiradix cache
             completed_tokens_tensor = torch.tensor(
                 min_completed_tokens, dtype=torch.int
@@ -464,6 +508,9 @@ class HiRadixCache(RadixCache):
         )
         last_host_node.release_host()
         del self.ongoing_prefetch[req_id]
+        self.cache_controller.prefetch_tokens_occupied -= len(token_ids)
+
+        return True

     def match_prefix(self, key: List[int], **kwargs):
         empty_value = torch.empty((0,), dtype=torch.int64, device=self.device)
@@ -531,6 +578,7 @@ class HiRadixCache(RadixCache):
             host_indices,
             operation,
         )
+        self.cache_controller.prefetch_tokens_occupied += len(new_input_tokens)

     def _insert_helper_host(self, node: TreeNode, key: List, host_value, hash_value):
         node.last_access_time = time.monotonic()
sglang/srt/mem_cache/memory_pool_host.py
CHANGED
@@ -618,7 +618,7 @@ class MLATokenToKVPoolHost(HostKVCache):
         elif self.layout == "page_first":
             transfer_kv_all_layer_mla_lf_pf(
                 src_layers=device_pool.data_ptrs,
-
+                dst=self.kv_buffer,
                 src_indices=device_indices,
                 dst_indices=host_indices,
                 item_size=self.token_stride_size,
sglang/srt/mem_cache/multimodal_cache.py
CHANGED
@@ -1,24 +1,46 @@
+import logging
+from collections import OrderedDict
 from typing import Dict

 import torch

+# Set up logging for cache behavior
+logger = logging.getLogger(__name__)
+

 class MultiModalCache:
-    """MultiModalCache is used to store vlm encoder results"""
+    """MultiModalCache is used to store vlm encoder results with LRU eviction"""

     def __init__(
         self,
         max_size: int,
     ):
         self.max_size = max_size
-        self.mm_cache:
+        self.mm_cache: OrderedDict[int, torch.Tensor] = OrderedDict()
         self.current_size = 0

+    def _allocate(self, embedding_size: int) -> bool:
+        """Allocate space by evicting least recently used entries"""
+        evictions = 0
+        while self.current_size + embedding_size > self.max_size and self.mm_cache:
+            _, old_embedding = self.mm_cache.popitem(last=False)
+            evicted_size = self._get_tensor_size(old_embedding)
+            self.current_size -= evicted_size
+            evictions += evicted_size
+
+        if evictions > 0:
+            logger.debug(
+                f"Cache eviction: evicted {evictions} bytes, remaining size: {self.current_size}/{self.max_size} bytes"
+            )
+
+        if self.current_size + embedding_size > self.max_size:
+            return False
+        return True
+
     def put(self, mm_hash: int, embedding: torch.Tensor) -> bool:
-        if mm_hash in self.mm_cache:
-            return True
         data_size = self._get_tensor_size(embedding)
-
+        # Lazy free cache if not enough space
+        if not self._allocate(data_size):
             return False
         self.mm_cache[mm_hash] = embedding
         self.current_size += data_size
@@ -28,14 +50,12 @@ class MultiModalCache:
         return mm_hash in self.mm_cache

     def get(self, mm_hash: int) -> torch.Tensor:
-
-
-
-
-        return
-
-        self.current_size -= self._get_tensor_size(old_embedding)
-        return True
+        """Get embedding and update LRU order"""
+        if mm_hash in self.mm_cache:
+            # Move to end (most recently used)
+            self.mm_cache.move_to_end(mm_hash)
+            return self.mm_cache[mm_hash]
+        return None

     def clear(self):
         self.mm_cache.clear()
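The rewrite turns the multimodal cache into a byte-budgeted LRU: `put` evicts from the front of the `OrderedDict` until the new embedding fits, and `get` moves a hit to the back. A small self-contained sketch of the same eviction order, using byte strings instead of tensors:

```python
from collections import OrderedDict


class ByteBudgetLRU:
    """Minimal LRU with a byte budget, mirroring the eviction order above."""

    def __init__(self, max_size: int):
        self.max_size = max_size
        self.size = 0
        self.items: OrderedDict[int, bytes] = OrderedDict()

    def put(self, key: int, value: bytes) -> bool:
        while self.size + len(value) > self.max_size and self.items:
            _, old = self.items.popitem(last=False)   # evict least recently used
            self.size -= len(old)
        if self.size + len(value) > self.max_size:
            return False                              # single item larger than the budget
        self.items[key] = value
        self.size += len(value)
        return True

    def get(self, key: int):
        if key in self.items:
            self.items.move_to_end(key)               # mark as most recently used
            return self.items[key]
        return None


cache = ByteBudgetLRU(max_size=8)
cache.put(1, b"aaaa"); cache.put(2, b"bbbb")
cache.get(1)                 # key 1 becomes most recently used
cache.put(3, b"cccc")        # evicts key 2, not key 1
assert cache.get(2) is None and cache.get(1) is not None
```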
sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py
CHANGED
@@ -96,6 +96,8 @@ class Hf3fsClient:
         )
         self.iov_r = make_iovec(self.shm_r, self.hf3fs_mount_point)
         self.iov_w = make_iovec(self.shm_w, self.hf3fs_mount_point)
+        self.shm_r.unlink()
+        self.shm_w.unlink()

         self.rlock = threading.RLock()
         self.wlock = threading.RLock()
@@ -176,8 +178,6 @@ class Hf3fsClient:
         del self.iov_w
         self.shm_r.close()
         self.shm_w.close()
-        self.shm_r.unlink()
-        self.shm_w.unlink()

     def flush(self) -> None:
         os.fsync(self.file)
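Unlinking the shared memory segments right after they are mapped, rather than at close time, means the kernel reclaims them even if the process dies before the cleanup path runs; the existing mapping stays valid until `close()`. A minimal standalone sketch of this pattern with Python's `multiprocessing.shared_memory` (assuming that is the mechanism behind `shm_r`/`shm_w`; POSIX semantics):

```python
from multiprocessing import shared_memory

# Create, unlink immediately, and keep using the existing mapping.
shm = shared_memory.SharedMemory(create=True, size=4096)
shm.unlink()             # drop the name now: no /dev/shm leak if we crash later
shm.buf[:5] = b"hello"   # the mapping itself remains valid until close()
assert bytes(shm.buf[:5]) == b"hello"
shm.close()              # release the mapping; the kernel frees the memory
```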
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -576,11 +576,11 @@ class CudaGraphRunner:
         )

         if self.model_runner.server_args.enable_lora:
-            # It is safe to capture CUDA graph using empty LoRA
-            # `--enable-lora` is set to True (and return immediately if the LoRA
-
+            # It is safe to capture CUDA graph using empty LoRA id, as the LoRA kernels will always be launched whenever
+            # `--enable-lora` is set to True (and return immediately if the LoRA id is empty for perf optimization).
+            lora_ids = [None] * bs
         else:
-
+            lora_ids = None

         forward_batch = ForwardBatch(
             forward_mode=self.capture_forward_mode,
@@ -589,6 +589,7 @@ class CudaGraphRunner:
             req_pool_indices=req_pool_indices,
             seq_lens=seq_lens,
             next_token_logits_buffer=next_token_logits_buffer,
+            orig_seq_lens=seq_lens,
             req_to_token_pool=self.model_runner.req_to_token_pool,
             token_to_kv_pool=self.model_runner.token_to_kv_pool,
             attn_backend=self.model_runner.attn_backend,
@@ -607,11 +608,11 @@ class CudaGraphRunner:
             capture_hidden_mode=self.capture_hidden_mode,
             num_token_non_padded=self.num_token_non_padded,
             global_forward_mode=self.capture_forward_mode,
-
+            lora_ids=lora_ids,
         )
         self.tbo_plugin.capture_one_batch_size(forward_batch, num_tokens=num_tokens)

-        if
+        if lora_ids is not None:
             self.model_runner.lora_manager.prepare_lora_batch(forward_batch)

         # Attention backend
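Capturing with a placeholder batch of empty LoRA ids works because CUDA graph capture records kernel launches against fixed buffers and replays them later with real data copied into those buffers. A generic, framework-level sketch of that capture/replay pattern (unrelated to the LoRA specifics above; requires a CUDA device):

```python
import torch

static_x = torch.zeros(8, 16, device="cuda")   # placeholder input; address must stay fixed
weight = torch.randn(16, 16, device="cuda")

# Warm up on a side stream before capture, as recommended by the PyTorch docs.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    _ = static_x @ weight
torch.cuda.current_stream().wait_stream(s)

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    static_y = static_x @ weight               # kernel launches are recorded here

static_x.copy_(torch.randn(8, 16, device="cuda"))  # fill the captured input buffer
graph.replay()                                     # rerun the recorded kernels
result = static_y.clone()
```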
sglang/srt/model_executor/forward_batch_info.py
CHANGED
@@ -180,6 +180,9 @@ class ForwardBatch:
     # The sum of all sequence lengths
     seq_lens_sum: int

+    # The original sequence length without being chunked. Qwen-1M related.
+    orig_seq_lens: Optional[torch.Tensor] = None
+
     # Optional seq_lens on cpu
     seq_lens_cpu: Optional[torch.Tensor] = None

@@ -248,7 +251,7 @@ class ForwardBatch:
     encoder_out_cache_loc: Optional[torch.Tensor] = None

     # For LoRA
-
+    lora_ids: Optional[List[str]] = None

     # For input embeddings
     input_embeds: Optional[torch.Tensor] = None
@@ -321,13 +324,14 @@ class ForwardBatch:
             encoder_out_cache_loc=batch.encoder_out_cache_loc,
             seq_lens_sum=batch.seq_lens_sum,
             seq_lens_cpu=batch.seq_lens_cpu,
+            orig_seq_lens=batch.orig_seq_lens,
             return_logprob=batch.return_logprob,
             top_logprobs_nums=batch.top_logprobs_nums,
             token_ids_logprobs=batch.token_ids_logprobs,
             is_extend_in_batch=batch.is_extend_in_batch,
             can_run_dp_cuda_graph=batch.can_run_dp_cuda_graph,
             global_forward_mode=batch.global_forward_mode,
-
+            lora_ids=batch.lora_ids,
             sampling_info=batch.sampling_info,
             req_to_token_pool=model_runner.req_to_token_pool,
             token_to_kv_pool=model_runner.token_to_kv_pool,
@@ -420,16 +424,12 @@ class ForwardBatch:
                 batch.extend_prefix_lens, dtype=torch.int32
             ).to(device, non_blocking=True)
             ret.extend_num_tokens = batch.extend_num_tokens
-
-
-
-
-
-
-            else:
-                positions, ret.extend_start_loc = compute_position_torch(
-                    ret.extend_prefix_lens, ret.extend_seq_lens
-                )
+            positions, ret.extend_start_loc = compute_position(
+                model_runner.server_args.attention_backend,
+                ret.extend_prefix_lens,
+                ret.extend_seq_lens,
+                ret.extend_num_tokens,
+            )
             if ret.positions is None:
                 ret.positions = positions
             ret.extend_prefix_lens_cpu = batch.extend_prefix_lens
@@ -632,8 +632,10 @@ class ForwardBatch:
         self.dp_padding_mode = dp_padding_mode

         if dp_padding_mode.is_max_len():
-            # when DP gather mode is all gather, we will use
-            #
+            # when DP gather mode is all gather, we will use
+            # all_gather_into_tensor to gather hidden states, where transferred
+            # tokens should be padded to the same length. We will also use
+            # reduce-scatter instead of all-reduce after MLP.
             max_num_tokens = max(global_num_tokens)
             global_num_tokens = [max_num_tokens] * sync_group_size
             buffer_len = max_num_tokens * sync_group_size
@@ -882,6 +884,25 @@ class PPProxyTensors:
         return f"PPProxyTensors(tensors={self.tensors})"


+def compute_position(
+    attn_backend: str,
+    extend_prefix_lens: torch.Tensor,
+    extend_seq_lens: torch.Tensor,
+    extend_seq_lens_sum: int,
+):
+    if support_triton(attn_backend):
+        positions, extend_start_loc = compute_position_triton(
+            extend_prefix_lens,
+            extend_seq_lens,
+            extend_seq_lens_sum,
+        )
+    else:
+        positions, extend_start_loc = compute_position_torch(
+            extend_prefix_lens, extend_seq_lens
+        )
+    return positions, extend_start_loc
+
+
 def compute_position_triton(
     extend_prefix_lens: torch.Tensor, extend_seq_lens: torch.Tensor, extend_seq_lens_sum
 ):
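The new `compute_position` wrapper only chooses between the Triton and Torch implementations based on the attention backend; both compute the same thing. As a concrete illustration of those semantics (a sketch, not the exact sglang code): positions are the absolute token indices of each request's extended tokens, and `extend_start_loc` is where each request starts in the flattened token dimension.

```python
import torch


def compute_position_torch_sketch(extend_prefix_lens, extend_seq_lens):
    # Absolute position of every extended token, flattened over the batch.
    positions = torch.cat(
        [
            torch.arange(p, p + s)
            for p, s in zip(extend_prefix_lens.tolist(), extend_seq_lens.tolist())
        ]
    )
    # Start offset of each request inside the flattened token dimension.
    extend_start_loc = torch.cumsum(extend_seq_lens, dim=0) - extend_seq_lens
    return positions, extend_start_loc


pos, start = compute_position_torch_sketch(torch.tensor([4, 0]), torch.tensor([3, 2]))
print(pos)    # tensor([4, 5, 6, 0, 1])
print(start)  # tensor([0, 3])
```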
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -1443,19 +1443,36 @@ class ModelRunner:
             )

             return CutlassMLABackend(self)
-        elif
+        elif backend_str == "trtllm_mla":
             if not self.use_mla_backend:
                 raise ValueError("trtllm_mla backend can only be used with MLA models.")
             from sglang.srt.layers.attention.trtllm_mla_backend import TRTLLMMLABackend

             return TRTLLMMLABackend(self)
-        elif
+        elif backend_str == "trtllm_mha":
+            if self.use_mla_backend:
+                raise ValueError(
+                    "trtllm_mha backend can only be used with non-MLA models."
+                )
+            from sglang.srt.layers.attention.trtllm_mha_backend import (
+                TRTLLMHAAttnBackend,
+            )
+
+            return TRTLLMHAAttnBackend(self)
+
+        elif backend_str == "intel_amx":
             from sglang.srt.layers.attention.intel_amx_backend import (
                 IntelAMXAttnBackend,
             )

             logger.info(f"Intel AMX attention backend is enabled.")
             return IntelAMXAttnBackend(self)
+        elif self.server_args.attention_backend == "dual_chunk_flash_attn":
+            from sglang.srt.layers.attention.dual_chunk_flashattention_backend import (
+                DualChunkFlashAttentionBackend,
+            )
+
+            return DualChunkFlashAttentionBackend(self)
         else:
             raise ValueError(f"Invalid attention backend: {backend_str}")

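With the two new branches, `trtllm_mha` and `dual_chunk_flash_attn` become selectable through the attention-backend server argument. A hedged usage sketch, assuming the offline engine forwards `ServerArgs` fields as keyword arguments (the model path is a placeholder):

```python
import sglang as sgl

# Assumption: Engine kwargs map onto ServerArgs, so `attention_backend`
# reaches the dispatch shown above. trtllm_mha is for non-MLA models.
llm = sgl.Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    attention_backend="trtllm_mha",
)
print(llm.generate("Hello", {"max_new_tokens": 8}))
```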
sglang/srt/model_loader/weight_utils.py
CHANGED
@@ -843,6 +843,16 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
             return None
         return remapped_name

+    quark_scale_names = {
+        ".q_proj.output_scale": ".attn.q_scale",
+        ".k_proj.output_scale": ".attn.k_scale",
+        ".v_proj.output_scale": ".attn.v_scale",
+        "self_attn.prob_output_scale": ".attn.prob_scale",
+    }
+    for quark_scale_name, sglang_scale_name in quark_scale_names.items():
+        if name.endswith(quark_scale_name):
+            return name.replace(quark_scale_name, sglang_scale_name)
+
     # If there were no matches, return the untouched param name
     return name
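The new mapping rewrites Quark-style quantization scale names into the attention-module parameter names sglang expects; only the matched suffix is replaced. A quick illustration of the same suffix-match logic with a made-up layer prefix:

```python
quark_scale_names = {
    ".q_proj.output_scale": ".attn.q_scale",
    ".k_proj.output_scale": ".attn.k_scale",
    ".v_proj.output_scale": ".attn.v_scale",
    "self_attn.prob_output_scale": ".attn.prob_scale",
}


def remap(name: str) -> str:
    # Same suffix-match-and-replace logic as the snippet above.
    for quark_name, sglang_name in quark_scale_names.items():
        if name.endswith(quark_name):
            return name.replace(quark_name, sglang_name)
    return name


# "model.layers.0.self_attn.k_proj.output_scale" -> "model.layers.0.self_attn.attn.k_scale"
print(remap("model.layers.0.self_attn.k_proj.output_scale"))
```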