sglang 0.4.1.post3__py3-none-any.whl → 0.4.1.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -0
- sglang/bench_serving.py +18 -1
- sglang/lang/interpreter.py +71 -1
- sglang/lang/ir.py +2 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/chatglm.py +78 -0
- sglang/srt/configs/dbrx.py +279 -0
- sglang/srt/configs/model_config.py +1 -1
- sglang/srt/hf_transformers_utils.py +9 -14
- sglang/srt/layers/attention/__init__.py +22 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +0 -52
- sglang/srt/layers/attention/flashinfer_backend.py +215 -83
- sglang/srt/layers/attention/torch_native_backend.py +1 -38
- sglang/srt/layers/attention/triton_backend.py +20 -11
- sglang/srt/layers/attention/triton_ops/decode_attention.py +4 -0
- sglang/srt/layers/linear.py +159 -55
- sglang/srt/layers/logits_processor.py +170 -215
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +198 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -7
- sglang/srt/layers/parameter.py +431 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/fp8.py +3 -3
- sglang/srt/layers/quantization/modelopt_quant.py +174 -0
- sglang/srt/layers/sampler.py +57 -21
- sglang/srt/layers/torchao_utils.py +17 -3
- sglang/srt/layers/vocab_parallel_embedding.py +1 -1
- sglang/srt/managers/cache_controller.py +307 -0
- sglang/srt/managers/data_parallel_controller.py +2 -0
- sglang/srt/managers/io_struct.py +1 -2
- sglang/srt/managers/schedule_batch.py +33 -3
- sglang/srt/managers/schedule_policy.py +159 -90
- sglang/srt/managers/scheduler.py +68 -28
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +27 -21
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/memory_pool.py +206 -1
- sglang/srt/metrics/collector.py +22 -30
- sglang/srt/model_executor/cuda_graph_runner.py +129 -77
- sglang/srt/model_executor/forward_batch_info.py +51 -21
- sglang/srt/model_executor/model_runner.py +72 -64
- sglang/srt/models/chatglm.py +1 -1
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/deepseek_v2.py +34 -7
- sglang/srt/models/grok.py +109 -29
- sglang/srt/models/llama.py +9 -2
- sglang/srt/openai_api/adapter.py +0 -17
- sglang/srt/openai_api/protocol.py +3 -3
- sglang/srt/sampling/sampling_batch_info.py +22 -0
- sglang/srt/sampling/sampling_params.py +9 -1
- sglang/srt/server.py +20 -13
- sglang/srt/server_args.py +120 -58
- sglang/srt/speculative/build_eagle_tree.py +347 -0
- sglang/srt/speculative/eagle_utils.py +626 -0
- sglang/srt/speculative/eagle_worker.py +184 -0
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/utils.py +47 -7
- sglang/test/test_programs.py +23 -1
- sglang/test/test_utils.py +36 -7
- sglang/version.py +1 -1
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/METADATA +12 -12
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/RECORD +86 -57
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/WHEEL +1 -1
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/top_level.txt +0 -0
sglang/srt/layers/sampler.py
CHANGED
@@ -1,5 +1,5 @@
 import logging
-from typing import
+from typing import List

 import torch
 from torch import nn
@@ -28,13 +28,12 @@ class Sampler(nn.Module):

     def forward(
         self,
-
+        logits_output: LogitsProcessorOutput,
         sampling_info: SamplingBatchInfo,
+        return_logprob: bool,
+        top_logprobs_nums: List[int],
     ):
-
-        logits = logits.next_token_logits
-
-        logits = logits.contiguous()
+        logits = logits_output.next_token_logits

         if self.use_nan_detectioin and torch.any(torch.isnan(logits)):
             logger.warning("Detected errors during sampling! NaN in the logits.")
@@ -47,6 +46,8 @@ class Sampler(nn.Module):
         if sampling_info.is_all_greedy:
             # Use torch.argmax if all requests use greedy sampling
             batch_next_token_ids = torch.argmax(logits, -1)
+            if return_logprob:
+                logprobs = torch.nn.functional.log_softmax(logits, dim=-1)
         else:
             # Post process logits
             logits.div_(sampling_info.temperatures)
@@ -54,6 +55,14 @@ class Sampler(nn.Module):
             del logits

             if global_server_args_dict["sampling_backend"] == "flashinfer":
+                if return_logprob:
+                    # NOTE: the top_p_renorm_prob from flashinfer has numerical problems,
+                    # https://github.com/flashinfer-ai/flashinfer/issues/708
+                    # so we use the torch implementation.
+                    logprobs = torch.log(
+                        top_p_normalize_probs_torch(probs, sampling_info.top_ps)
+                    )
+
                 max_top_k_round, batch_size = 32, probs.shape[0]
                 uniform_samples = torch.rand(
                     (max_top_k_round, batch_size), device=probs.device
@@ -76,6 +85,7 @@ class Sampler(nn.Module):
                 if self.use_nan_detectioin and not torch.all(success):
                     logger.warning("Detected errors during sampling!")
                     batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
+
             elif global_server_args_dict["sampling_backend"] == "pytorch":
                 # A slower fallback implementation with torch native operations.
                 batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
@@ -85,12 +95,31 @@ class Sampler(nn.Module):
                     sampling_info.min_ps,
                     sampling_info.need_min_p_sampling,
                 )
+                if return_logprob:
+                    logprobs = torch.log(
+                        top_p_normalize_probs_torch(probs, sampling_info.top_ps)
+                    )
             else:
                 raise ValueError(
                     f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
                 )

-
+        batch_next_token_ids = batch_next_token_ids.to(torch.int32)
+
+        # Attach logprobs to logits_output (in-place modification)
+        if return_logprob:
+            if any(x > 0 for x in top_logprobs_nums):
+                (
+                    logits_output.next_token_top_logprobs_val,
+                    logits_output.next_token_top_logprobs_idx,
+                ) = get_top_logprobs(logprobs, top_logprobs_nums)
+
+            logits_output.next_token_logprobs = logprobs[
+                torch.arange(len(batch_next_token_ids), device=sampling_info.device),
+                batch_next_token_ids,
+            ]
+
+        return batch_next_token_ids


 def top_k_top_p_min_p_sampling_from_probs_torch(
@@ -120,20 +149,27 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
     return batch_next_token_ids


-def
+def top_p_normalize_probs_torch(
     probs: torch.Tensor,
     top_ps: torch.Tensor,
 ):
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # See also top_k_top_p_min_p_sampling_from_probs_torch
+    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
+    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
+    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
+    return torch.zeros_like(probs_sort).scatter_(-1, probs_idx, probs_sort)
+
+
+def get_top_logprobs(logprobs: torch.Tensor, top_logprobs_nums: List[int]):
+    max_k = max(top_logprobs_nums)
+    ret = logprobs.topk(max_k, dim=1)
+    values = ret.values.tolist()
+    indices = ret.indices.tolist()
+
+    output_top_logprobs_val = []
+    output_top_logprobs_idx = []
+    for i, k in enumerate(top_logprobs_nums):
+        output_top_logprobs_val.append(values[i][:k])
+        output_top_logprobs_idx.append(indices[i][:k])
+    return output_top_logprobs_val, output_top_logprobs_idx
sglang/srt/layers/torchao_utils.py
CHANGED
@@ -11,6 +11,22 @@ import torch
 logger = logging.getLogger(__name__)


+def get_gemlite_cache_path() -> str:
+    return f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
+
+
+def save_gemlite_cache(print_error: bool = False) -> bool:
+    try:
+        from gemlite.core import GemLiteLinearTriton
+
+        GemLiteLinearTriton.cache_config(get_gemlite_cache_path())
+    except Exception:
+        if print_error:
+            logger.error("Failed to save the GemLite cache.")
+        return False
+    return True
+
+
 def apply_torchao_config_to_model(
     model: torch.nn.Module, torchao_config: str, filter_fn=None
 ):
@@ -74,9 +90,7 @@ apply_torchao_config_to_model(
         )

         # try to load gemlite kernel config
-        GemLiteLinearTriton.load_config(
-            f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
-        )
+        GemLiteLinearTriton.load_config(get_gemlite_cache_path())

     elif "fp8wo" in torchao_config:
         # this requires newer hardware
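The two helpers centralize the GemLite kernel-config path that was previously inlined in `apply_torchao_config_to_model`. A quick way to exercise them, assuming sglang 0.4.1.post5 and its import-time dependencies are installed; if the optional `gemlite` package is absent, `save_gemlite_cache` catches the exception and returns `False` rather than raising.

```python
# Hedged usage sketch of the new GemLite cache helpers.
# Assumes sglang 0.4.1.post5 (and its import-time dependencies) is installed.
from sglang.srt.layers.torchao_utils import (
    get_gemlite_cache_path,
    save_gemlite_cache,
)

# Per-user path under /tmp where GemLite's autotuned kernel configs are cached.
print(get_gemlite_cache_path())

# True if the cache was written; False if gemlite is unavailable or saving
# failed (the failure is logged when print_error=True).
print(save_gemlite_cache(print_error=True))
```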
sglang/srt/layers/vocab_parallel_embedding.py
CHANGED
@@ -12,8 +12,8 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from vllm.model_executor.parameter import BasevLLMParameter

+from sglang.srt.layers.parameter import BasevLLMParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
sglang/srt/managers/cache_controller.py
ADDED
@@ -0,0 +1,307 @@
+from __future__ import annotations
+
+"""
+Copyright 2023-2025 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+import threading
+from queue import PriorityQueue, Queue
+from typing import Optional
+
+import torch
+
+from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool, MLATokenToKVPoolHost
+
+logger = logging.getLogger(__name__)
+
+
+class CacheOperation:
+
+    counter = 0
+
+    def __init__(
+        self,
+        host_indices: torch.Tensor,
+        device_indices: torch.Tensor,
+        node_id: int,
+        priority: Optional[int] = None,
+    ):
+        self.host_indices = host_indices
+        self.device_indices = device_indices
+        self.node_ids = [node_id]
+        self.data = None
+
+        self.id = CacheOperation.counter
+        CacheOperation.counter += 1
+        # default priority is the order of creation
+        self.priority = priority if priority is not None else self.id
+
+    def merge(self, other: "CacheOperation") -> None:
+        # multiple operations can be merged into a single operation for batch processing
+        self.host_indices = torch.cat([self.host_indices, other.host_indices])
+        self.device_indices = torch.cat([self.device_indices, other.device_indices])
+        self.priority = min(self.priority, other.priority)
+        self.node_ids.extend(other.node_ids)
+
+    def __lt__(self, other: "CacheOperation"):
+        return self.priority < other.priority
+
+
+class TransferBuffer:
+    """
+    Overlapping buffer preparation and transfer operations to improve throughput.
+    """
+
+    def __init__(self, buffer_count: int = 3, max_buffer_size: int = 1000) -> None:
+        self.buffers = Queue(maxsize=buffer_count)
+        # todo: adjust the buffer size based on throughput profile of the system
+        self.max_buffer_size = max_buffer_size
+
+    def full(self) -> bool:
+        return self.buffers.full()
+
+    def empty(self) -> bool:
+        return self.buffers.empty()
+
+    def put(self, item, block=True) -> None:
+        self.buffers.put(item, block=block)
+
+    def get(self, block=True) -> Optional[CacheOperation]:
+        try:
+            return self.buffers.get(block=block)
+        except Exception as e:
+            logger.error(e)
+
+
+class HiCacheController:
+
+    def __init__(
+        self,
+        mem_pool_device: MHATokenToKVPool,
+        mem_pool_host: MLATokenToKVPoolHost,
+        write_policy: str = "write_through_selective",
+    ):
+
+        self.mem_pool_device = mem_pool_device
+        self.mem_pool_host = mem_pool_host
+        self.write_policy = write_policy
+
+        if write_policy not in [
+            "write_through",
+            "write_through_selective",
+            "write_back",
+        ]:
+            raise ValueError(f"Invalid write policy: {write_policy}")
+
+        self.write_queue = PriorityQueue()
+        self.load_queue = PriorityQueue()
+
+        self.ack_write_queue = Queue()
+        self.ack_load_queue = Queue()
+
+        self.write_buffer = TransferBuffer()
+        self.load_buffer = TransferBuffer()
+
+        self.write_stream = torch.cuda.Stream()
+        self.load_stream = torch.cuda.Stream()
+
+        self.write_thread = threading.Thread(
+            target=self.write_thread_func_buffer, daemon=True
+        )
+        self.load_thread = threading.Thread(
+            target=self.load_thread_func_buffer, daemon=True
+        )
+        self.write_thread.start()
+        self.load_thread.start()
+
+    def write(
+        self,
+        device_indices: torch.Tensor,
+        priority: Optional[int] = None,
+        node_id: int = 0,
+    ) -> Optional[torch.Tensor]:
+        """
+        Back up KV caches from device memory to host memory.
+        """
+        host_indices = self.mem_pool_host.alloc(len(device_indices))
+        if host_indices is None:
+            return None
+        self.write_queue.put(
+            CacheOperation(host_indices, device_indices, node_id, priority)
+        )
+        self.mem_pool_host.protect_write(host_indices)
+        return host_indices
+
+    def load(
+        self,
+        host_indices: torch.Tensor,
+        priority: Optional[int] = None,
+        node_id: int = 0,
+    ) -> Optional[torch.Tensor]:
+        """
+        Load KV caches from host memory to device memory.
+        """
+        device_indices = self.mem_pool_device.alloc(len(host_indices))
+        if device_indices is None:
+            return None
+        self.load_queue.put(
+            CacheOperation(host_indices, device_indices, node_id, priority)
+        )
+        self.mem_pool_host.protect_load(host_indices)
+        return device_indices
+
+    def write_thread_func_direct(self):
+        """
+        Directly write through KV caches to host memory without buffering.
+        """
+        with torch.cuda.stream(self.write_stream):
+            while True:
+                try:
+                    operation = self.write_queue.get(block=True)
+                    operation.data = self.mem_pool_device.get_flat_data(
+                        operation.device_indices
+                    )
+                    self.mem_pool_host.transfer(operation.host_indices, operation.data)
+                    self.mem_pool_host.complete_io(operation.host_indices)
+                    for node_id in operation.node_ids:
+                        self.ack_write_queue.put(node_id)
+                except Exception as e:
+                    logger.error(e)
+
+    def load_thread_func_direct(self):
+        """
+        Directly load KV caches from host memory to device memory without buffering.
+        """
+        with torch.cuda.stream(self.load_stream):
+            while True:
+                try:
+                    operation = self.load_queue.get(block=True)
+                    operation.data = self.mem_pool_host.get_flat_data(
+                        operation.host_indices
+                    )
+                    self.mem_pool_device.transfer(
+                        operation.device_indices, operation.data
+                    )
+                    self.mem_pool_host.complete_io(operation.host_indices)
+                    for node_id in operation.node_ids:
+                        self.ack_load_queue.put(node_id)
+                except Exception as e:
+                    logger.error(e)
+
+    def write_aux_func(self, no_wait=False):
+        """
+        Auxiliary function to prepare the buffer for write operations.
+        """
+        buffer = None
+        while True:
+            try:
+                operation = self.write_queue.get(block=True)
+                if buffer is None:
+                    buffer = operation
+                else:
+                    buffer.merge(operation)
+                if (
+                    no_wait
+                    or len(buffer.host_indices) >= self.write_buffer.max_buffer_size
+                    or self.write_queue.empty()
+                    or self.write_buffer.empty()
+                ):
+                    assert (
+                        buffer.device_indices.is_cuda
+                    ), "Device indices should be on GPU"
+                    buffer.data = self.mem_pool_device.get_flat_data(
+                        buffer.device_indices
+                    ).contiguous()
+                    self.write_buffer.put(buffer, block=True)
+                    buffer = None
+            except Exception as e:
+                logger.error(e)
+
+    def load_aux_func(self):
+        """
+        Auxiliary function to prepare the buffer for load operations.
+        """
+        buffer = None
+        while True:
+            try:
+                operation = self.load_queue.get(block=True)
+                if buffer is None:
+                    buffer = operation
+                else:
+                    buffer.merge(operation)
+                if (
+                    len(buffer.host_indices) >= self.load_buffer.max_buffer_size
+                    or self.load_queue.empty()
+                    or self.load_buffer.empty()
+                ):
+                    buffer.data = (
+                        self.mem_pool_host.get_flat_data(buffer.host_indices)
+                        .contiguous()
+                        .pin_memory()
+                    )
+                    self.load_buffer.put(buffer, block=True)
+                    buffer = None
+            except Exception as e:
+                logger.error(e)
+
+    def write_thread_func_buffer(self):
+        aux_thread = threading.Thread(target=self.write_aux_func, daemon=True)
+        aux_thread.start()
+        with torch.cuda.stream(self.write_stream):
+            while True:
+                operation = self.write_buffer.get()
+                if operation is None:
+                    continue
+                self.mem_pool_host.transfer(operation.host_indices, operation.data)
+                self.mem_pool_host.complete_io(operation.host_indices)
+                for node_id in operation.node_ids:
+                    self.ack_write_queue.put(node_id)
+
+    def load_thread_func_buffer(self):
+        aux_thread = threading.Thread(target=self.load_aux_func, daemon=True)
+        aux_thread.start()
+        with torch.cuda.stream(self.load_stream):
+            while True:
+                operation = self.load_buffer.get()
+                if operation is None:
+                    continue
+                self.mem_pool_device.transfer(operation.device_indices, operation.data)
+                self.mem_pool_host.complete_io(operation.host_indices)
+                for node_id in operation.node_ids:
+                    self.ack_load_queue.put(node_id)
+
+    def evict_device(
+        self, device_indices: torch.Tensor, host_indices: torch.Tensor
+    ) -> int:
+        if self.mem_pool_host.is_synced(host_indices):
+            self.mem_pool_device.free(device_indices)
+            self.mem_pool_host.update_backup(host_indices)
+            return len(device_indices)
+        else:
+            raise ValueError(
+                f"Inconsistent states: {self.mem_pool_host.get_state(host_indices)}"
+            )
+
+    def evict_host(self, host_indices: torch.Tensor, backup_only: bool = True) -> int:
+        if not backup_only:
+            raise ValueError("Other eviction policies are not supported yet.")
+
+        if self.mem_pool_host.is_backup(host_indices):
+            self.mem_pool_host.free(host_indices)
+            return len(host_indices)
+        else:
+            raise ValueError(
+                f"Inconsistent states: {self.mem_pool_host.get_state(host_indices)}"
+            )
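The buffered write/load paths above work by merging queued `CacheOperation`s so that one flat copy moves many KV-cache slots at once. A minimal sketch of that merge behavior, assuming sglang 0.4.1.post5 and its import-time dependencies (torch, the memory-pool module) are importable; the index tensors are made up for illustration.

```python
# Sketch of CacheOperation batching from the new cache_controller module.
# Assumes sglang 0.4.1.post5 is installed; indices below are illustrative.
import torch

from sglang.srt.managers.cache_controller import CacheOperation

op_a = CacheOperation(
    host_indices=torch.tensor([0, 1]),
    device_indices=torch.tensor([10, 11]),
    node_id=1,
    priority=5,
)
op_b = CacheOperation(
    host_indices=torch.tensor([2, 3]),
    device_indices=torch.tensor([12, 13]),
    node_id=2,
    priority=2,  # lower value = popped earlier from the PriorityQueue
)

# write_aux_func / load_aux_func merge pending operations like this before
# handing one batched transfer to the CUDA stream.
op_a.merge(op_b)
print(op_a.host_indices)    # tensor([0, 1, 2, 3])
print(op_a.device_indices)  # tensor([10, 11, 12, 13])
print(op_a.node_ids)        # [1, 2] -- every merged node gets an ack later
print(op_a.priority)        # 2 (the minimum of the merged priorities)
```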
sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -20,6 +20,7 @@ import threading
 from enum import Enum, auto

 import psutil
+import setproctitle
 import zmq

 from sglang.srt.managers.io_struct import (
@@ -230,6 +231,7 @@ run_data_parallel_controller_process(
     port_args: PortArgs,
     pipe_writer,
 ):
+    setproctitle.setproctitle("sglang::data_parallel_controller")
     configure_logger(server_args)
     parent_process = psutil.Process().parent()

sglang/srt/managers/schedule_batch.py
CHANGED
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 # Copyright 2023-2024 SGLang Team
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -29,7 +31,7 @@ ScheduleBatch -> ModelWorkerBatch -> ForwardBatch

 import dataclasses
 import logging
-from typing import List, Optional, Set, Tuple, Union
+from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union

 import numpy as np
 import torch
@@ -42,11 +44,15 @@ from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
-from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs

+if TYPE_CHECKING:
+    from sglang.srt.speculative.spec_info import SpecInfo, SpeculativeAlgorithm
+
+
 INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5

 # Put some global args for easy access
@@ -565,9 +571,13 @@ class ScheduleBatch:
     # Has grammar
     has_grammar: bool = False

-    #
+    # Device
     device: str = "cuda"

+    # Speculative decoding
+    spec_algorithm: SpeculativeAlgorithm = None
+    spec_info: Optional[SpecInfo] = None
+
     @classmethod
     def init_new(
         cls,
@@ -577,6 +587,7 @@ class ScheduleBatch:
         tree_cache: BasePrefixCache,
         model_config: ModelConfig,
         enable_overlap: bool,
+        spec_algorithm: SpeculativeAlgorithm,
     ):
         return cls(
             reqs=reqs,
@@ -589,6 +600,7 @@ class ScheduleBatch:
             has_stream=any(req.stream for req in reqs),
             has_grammar=any(req.grammar for req in reqs),
             device=req_to_token_pool.device,
+            spec_algorithm=spec_algorithm,
         )

     def batch_size(self):
@@ -998,6 +1010,8 @@ class ScheduleBatch:

     def prepare_for_decode(self):
         self.forward_mode = ForwardMode.DECODE
+        if self.spec_algorithm.is_eagle():
+            return

         self.input_ids = self.output_ids
         self.output_ids = None
@@ -1103,6 +1117,9 @@ class ScheduleBatch:
         self.has_stream |= other.has_stream
         self.has_grammar |= other.has_grammar

+        if self.spec_info:
+            self.spec_info.merge_batch(other.spec_info)
+
     def get_model_worker_batch(self):
         if self.forward_mode.is_decode() or self.forward_mode.is_idle():
             extend_seq_lens = extend_prefix_lens = extend_logprob_start_lens = None
@@ -1144,6 +1161,13 @@ class ScheduleBatch:
             lora_paths=[req.lora_path for req in self.reqs],
             sampling_info=self.sampling_info,
             input_embeds=self.input_embeds,
+            spec_algorithm=self.spec_algorithm,
+            spec_info=self.spec_info,
+            capture_hidden_mode=(
+                getattr(self.spec_info, "capture_hidden_mode", CaptureHiddenMode.NULL)
+                if self.spec_info
+                else CaptureHiddenMode.NULL
+            ),
         )

     def copy(self):
@@ -1155,6 +1179,7 @@ class ScheduleBatch:
             out_cache_loc=self.out_cache_loc,
             return_logprob=self.return_logprob,
             decoding_reqs=self.decoding_reqs,
+            spec_algorithm=self.spec_algorithm,
        )

     def __str__(self):
@@ -1214,6 +1239,11 @@ class ModelWorkerBatch:
     # The input Embeds
     input_embeds: Optional[torch.tensor] = None

+    # Speculative decoding
+    spec_algorithm: SpeculativeAlgorithm = None
+    spec_info: Optional[SpecInfo] = None
+    capture_hidden_mode: CaptureHiddenMode = None
+

 @triton.jit
 def write_req_to_token_pool_triton(
def write_req_to_token_pool_triton(
|