sglang 0.4.1.post4__py3-none-any.whl → 0.4.1.post6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +18 -1
- sglang/lang/interpreter.py +71 -1
- sglang/lang/ir.py +2 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/chatglm.py +78 -0
- sglang/srt/configs/dbrx.py +279 -0
- sglang/srt/configs/model_config.py +16 -7
- sglang/srt/hf_transformers_utils.py +9 -14
- sglang/srt/layers/attention/__init__.py +8 -1
- sglang/srt/layers/attention/flashinfer_backend.py +21 -5
- sglang/srt/layers/linear.py +89 -47
- sglang/srt/layers/logits_processor.py +6 -6
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +16 -5
- sglang/srt/layers/moe/fused_moe_triton/layer.py +39 -12
- sglang/srt/layers/moe/topk.py +4 -2
- sglang/srt/layers/parameter.py +439 -0
- sglang/srt/layers/quantization/__init__.py +5 -2
- sglang/srt/layers/quantization/fp8.py +107 -53
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/int8_kernel.py +54 -0
- sglang/srt/layers/quantization/modelopt_quant.py +174 -0
- sglang/srt/layers/quantization/w8a8_int8.py +117 -0
- sglang/srt/layers/radix_attention.py +2 -0
- sglang/srt/layers/vocab_parallel_embedding.py +16 -3
- sglang/srt/managers/cache_controller.py +307 -0
- sglang/srt/managers/configure_logging.py +43 -0
- sglang/srt/managers/data_parallel_controller.py +2 -0
- sglang/srt/managers/detokenizer_manager.py +0 -2
- sglang/srt/managers/io_struct.py +29 -13
- sglang/srt/managers/schedule_batch.py +7 -1
- sglang/srt/managers/scheduler.py +58 -15
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +109 -45
- sglang/srt/mem_cache/memory_pool.py +313 -53
- sglang/srt/metrics/collector.py +32 -35
- sglang/srt/model_executor/cuda_graph_runner.py +14 -7
- sglang/srt/model_executor/forward_batch_info.py +20 -15
- sglang/srt/model_executor/model_runner.py +53 -10
- sglang/srt/models/chatglm.py +1 -1
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/grok.py +25 -16
- sglang/srt/models/llama.py +46 -4
- sglang/srt/models/qwen2.py +11 -0
- sglang/srt/models/qwen2_eagle.py +131 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +15 -5
- sglang/srt/sampling/sampling_batch_info.py +15 -5
- sglang/srt/sampling/sampling_params.py +1 -1
- sglang/srt/server.py +125 -69
- sglang/srt/server_args.py +39 -19
- sglang/srt/speculative/eagle_utils.py +93 -85
- sglang/srt/speculative/eagle_worker.py +48 -33
- sglang/srt/torch_memory_saver_adapter.py +59 -0
- sglang/srt/utils.py +61 -5
- sglang/test/test_programs.py +23 -1
- sglang/test/test_utils.py +36 -7
- sglang/version.py +1 -1
- {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/METADATA +16 -15
- {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/RECORD +61 -51
- {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/WHEEL +1 -1
- {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/top_level.txt +0 -0

sglang/srt/managers/cache_controller.py
ADDED
@@ -0,0 +1,307 @@
+from __future__ import annotations
+
+"""
+Copyright 2023-2025 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+import threading
+from queue import PriorityQueue, Queue
+from typing import Optional
+
+import torch
+
+from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool, MLATokenToKVPoolHost
+
+logger = logging.getLogger(__name__)
+
+
+class CacheOperation:
+
+    counter = 0
+
+    def __init__(
+        self,
+        host_indices: torch.Tensor,
+        device_indices: torch.Tensor,
+        node_id: int,
+        priority: Optional[int] = None,
+    ):
+        self.host_indices = host_indices
+        self.device_indices = device_indices
+        self.node_ids = [node_id]
+        self.data = None
+
+        self.id = CacheOperation.counter
+        CacheOperation.counter += 1
+        # default priority is the order of creation
+        self.priority = priority if priority is not None else self.id
+
+    def merge(self, other: "CacheOperation") -> None:
+        # multiple operations can be merged into a single operation for batch processing
+        self.host_indices = torch.cat([self.host_indices, other.host_indices])
+        self.device_indices = torch.cat([self.device_indices, other.device_indices])
+        self.priority = min(self.priority, other.priority)
+        self.node_ids.extend(other.node_ids)
+
+    def __lt__(self, other: "CacheOperation"):
+        return self.priority < other.priority
+
+
+class TransferBuffer:
+    """
+    Overlapping buffer preparation and transfer operations to improve throughput.
+    """
+
+    def __init__(self, buffer_count: int = 3, max_buffer_size: int = 1000) -> None:
+        self.buffers = Queue(maxsize=buffer_count)
+        # todo: adjust the buffer size based on throughput profile of the system
+        self.max_buffer_size = max_buffer_size
+
+    def full(self) -> bool:
+        return self.buffers.full()
+
+    def empty(self) -> bool:
+        return self.buffers.empty()
+
+    def put(self, item, block=True) -> None:
+        self.buffers.put(item, block=block)
+
+    def get(self, block=True) -> Optional[CacheOperation]:
+        try:
+            return self.buffers.get(block=block)
+        except Exception as e:
+            logger.error(e)
+
+
+class HiCacheController:
+
+    def __init__(
+        self,
+        mem_pool_device: MHATokenToKVPool,
+        mem_pool_host: MLATokenToKVPoolHost,
+        write_policy: str = "write_through_selective",
+    ):
+
+        self.mem_pool_device = mem_pool_device
+        self.mem_pool_host = mem_pool_host
+        self.write_policy = write_policy
+
+        if write_policy not in [
+            "write_through",
+            "write_through_selective",
+            "write_back",
+        ]:
+            raise ValueError(f"Invalid write policy: {write_policy}")
+
+        self.write_queue = PriorityQueue()
+        self.load_queue = PriorityQueue()
+
+        self.ack_write_queue = Queue()
+        self.ack_load_queue = Queue()
+
+        self.write_buffer = TransferBuffer()
+        self.load_buffer = TransferBuffer()
+
+        self.write_stream = torch.cuda.Stream()
+        self.load_stream = torch.cuda.Stream()
+
+        self.write_thread = threading.Thread(
+            target=self.write_thread_func_buffer, daemon=True
+        )
+        self.load_thread = threading.Thread(
+            target=self.load_thread_func_buffer, daemon=True
+        )
+        self.write_thread.start()
+        self.load_thread.start()
+
+    def write(
+        self,
+        device_indices: torch.Tensor,
+        priority: Optional[int] = None,
+        node_id: int = 0,
+    ) -> Optional[torch.Tensor]:
+        """
+        Back up KV caches from device memory to host memory.
+        """
+        host_indices = self.mem_pool_host.alloc(len(device_indices))
+        if host_indices is None:
+            return None
+        self.write_queue.put(
+            CacheOperation(host_indices, device_indices, node_id, priority)
+        )
+        self.mem_pool_host.protect_write(host_indices)
+        return host_indices
+
+    def load(
+        self,
+        host_indices: torch.Tensor,
+        priority: Optional[int] = None,
+        node_id: int = 0,
+    ) -> Optional[torch.Tensor]:
+        """
+        Load KV caches from host memory to device memory.
+        """
+        device_indices = self.mem_pool_device.alloc(len(host_indices))
+        if device_indices is None:
+            return None
+        self.load_queue.put(
+            CacheOperation(host_indices, device_indices, node_id, priority)
+        )
+        self.mem_pool_host.protect_load(host_indices)
+        return device_indices
+
+    def write_thread_func_direct(self):
+        """
+        Directly write through KV caches to host memory without buffering.
+        """
+        with torch.cuda.stream(self.write_stream):
+            while True:
+                try:
+                    operation = self.write_queue.get(block=True)
+                    operation.data = self.mem_pool_device.get_flat_data(
+                        operation.device_indices
+                    )
+                    self.mem_pool_host.transfer(operation.host_indices, operation.data)
+                    self.mem_pool_host.complete_io(operation.host_indices)
+                    for node_id in operation.node_ids:
+                        self.ack_write_queue.put(node_id)
+                except Exception as e:
+                    logger.error(e)
+
+    def load_thread_func_direct(self):
+        """
+        Directly load KV caches from host memory to device memory without buffering.
+        """
+        with torch.cuda.stream(self.load_stream):
+            while True:
+                try:
+                    operation = self.load_queue.get(block=True)
+                    operation.data = self.mem_pool_host.get_flat_data(
+                        operation.host_indices
+                    )
+                    self.mem_pool_device.transfer(
+                        operation.device_indices, operation.data
+                    )
+                    self.mem_pool_host.complete_io(operation.host_indices)
+                    for node_id in operation.node_ids:
+                        self.ack_load_queue.put(node_id)
+                except Exception as e:
+                    logger.error(e)
+
+    def write_aux_func(self, no_wait=False):
+        """
+        Auxiliary function to prepare the buffer for write operations.
+        """
+        buffer = None
+        while True:
+            try:
+                operation = self.write_queue.get(block=True)
+                if buffer is None:
+                    buffer = operation
+                else:
+                    buffer.merge(operation)
+                if (
+                    no_wait
+                    or len(buffer.host_indices) >= self.write_buffer.max_buffer_size
+                    or self.write_queue.empty()
+                    or self.write_buffer.empty()
+                ):
+                    assert (
+                        buffer.device_indices.is_cuda
+                    ), "Device indices should be on GPU"
+                    buffer.data = self.mem_pool_device.get_flat_data(
+                        buffer.device_indices
+                    ).contiguous()
+                    self.write_buffer.put(buffer, block=True)
+                    buffer = None
+            except Exception as e:
+                logger.error(e)
+
+    def load_aux_func(self):
+        """
+        Auxiliary function to prepare the buffer for load operations.
+        """
+        buffer = None
+        while True:
+            try:
+                operation = self.load_queue.get(block=True)
+                if buffer is None:
+                    buffer = operation
+                else:
+                    buffer.merge(operation)
+                if (
+                    len(buffer.host_indices) >= self.load_buffer.max_buffer_size
+                    or self.load_queue.empty()
+                    or self.load_buffer.empty()
+                ):
+                    buffer.data = (
+                        self.mem_pool_host.get_flat_data(buffer.host_indices)
+                        .contiguous()
+                        .pin_memory()
+                    )
+                    self.load_buffer.put(buffer, block=True)
+                    buffer = None
+            except Exception as e:
+                logger.error(e)
+
+    def write_thread_func_buffer(self):
+        aux_thread = threading.Thread(target=self.write_aux_func, daemon=True)
+        aux_thread.start()
+        with torch.cuda.stream(self.write_stream):
+            while True:
+                operation = self.write_buffer.get()
+                if operation is None:
+                    continue
+                self.mem_pool_host.transfer(operation.host_indices, operation.data)
+                self.mem_pool_host.complete_io(operation.host_indices)
+                for node_id in operation.node_ids:
+                    self.ack_write_queue.put(node_id)
+
+    def load_thread_func_buffer(self):
+        aux_thread = threading.Thread(target=self.load_aux_func, daemon=True)
+        aux_thread.start()
+        with torch.cuda.stream(self.load_stream):
+            while True:
+                operation = self.load_buffer.get()
+                if operation is None:
+                    continue
+                self.mem_pool_device.transfer(operation.device_indices, operation.data)
+                self.mem_pool_host.complete_io(operation.host_indices)
+                for node_id in operation.node_ids:
+                    self.ack_load_queue.put(node_id)
+
+    def evict_device(
+        self, device_indices: torch.Tensor, host_indices: torch.Tensor
+    ) -> int:
+        if self.mem_pool_host.is_synced(host_indices):
+            self.mem_pool_device.free(device_indices)
+            self.mem_pool_host.update_backup(host_indices)
+            return len(device_indices)
+        else:
+            raise ValueError(
+                f"Inconsistent states: {self.mem_pool_host.get_state(host_indices)}"
+            )
+
+    def evict_host(self, host_indices: torch.Tensor, backup_only: bool = True) -> int:
+        if not backup_only:
+            raise ValueError("Other eviction policies are not supported yet.")
+
+        if self.mem_pool_host.is_backup(host_indices):
+            self.mem_pool_host.free(host_indices)
+            return len(host_indices)
+        else:
+            raise ValueError(
+                f"Inconsistent states: {self.mem_pool_host.get_state(host_indices)}"
+            )
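
For reference, a minimal sketch (not part of the diff) of the priority ordering and merge behavior the new controller relies on. It assumes sglang 0.4.1.post6 is installed, uses only the CacheOperation class shown above, and the index tensors are made up for illustration:

import torch
from queue import PriorityQueue

from sglang.srt.managers.cache_controller import CacheOperation

q = PriorityQueue()
# Two pending backup operations with made-up host/device index tensors;
# a lower priority value is served first.
op_a = CacheOperation(torch.tensor([0, 1]), torch.tensor([10, 11]), node_id=1, priority=5)
op_b = CacheOperation(torch.tensor([2, 3]), torch.tensor([12, 13]), node_id=2, priority=1)
q.put(op_a)
q.put(op_b)

head = q.get()        # op_b comes out first via CacheOperation.__lt__
head.merge(q.get())   # fold op_a into one batched transfer
print(head.node_ids)      # [2, 1] -- both nodes get acked after a single transfer
print(head.host_indices)  # tensor([2, 3, 0, 1])
print(head.priority)      # 1, the minimum of the merged operations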

sglang/srt/managers/configure_logging.py
ADDED
@@ -0,0 +1,43 @@
+"""
+Copyright 2023-2025 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+Configure the logging settings of a server.
+
+Usage:
+python3 -m sglang.srt.managers.configure_logging --url http://localhost:30000
+"""
+
+import argparse
+
+import requests
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--url", type=str, default="http://localhost:30000")
+    parser.add_argument(
+        "--dump-requests-folder", type=str, default="/tmp/sglang_request_dump"
+    )
+    parser.add_argument("--dump-requests-threshold", type=int, default=1000)
+    args = parser.parse_args()
+
+    response = requests.post(
+        args.url + "/configure_logging",
+        json={
+            "dump_requests_folder": args.dump_requests_folder,
+            "dump_requests_threshold": args.dump_requests_threshold,
+        },
+    )
+    assert response.status_code == 200

sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -20,6 +20,7 @@ import threading
 from enum import Enum, auto
 
 import psutil
+import setproctitle
 import zmq
 
 from sglang.srt.managers.io_struct import (
@@ -230,6 +231,7 @@ def run_data_parallel_controller_process(
     port_args: PortArgs,
     pipe_writer,
 ):
+    setproctitle.setproctitle("sglang::data_parallel_controller")
     configure_logger(server_args)
     parent_process = psutil.Process().parent()
 

sglang/srt/managers/detokenizer_manager.py
CHANGED
@@ -181,8 +181,6 @@ class DetokenizerManager:
                     finished_reasons=recv_obj.finished_reasons,
                     output_strs=output_strs,
                     prompt_tokens=recv_obj.prompt_tokens,
-                    origin_input_ids=recv_obj.origin_input_ids,
-                    output_ids=recv_obj.output_ids,
                     completion_tokens=recv_obj.completion_tokens,
                     cached_tokens=recv_obj.cached_tokens,
                     input_token_logprobs_val=recv_obj.input_token_logprobs_val,

sglang/srt/managers/io_struct.py
CHANGED
@@ -19,9 +19,7 @@ processes (TokenizerManager, DetokenizerManager, Controller).
 import uuid
 from dataclasses import dataclass
 from enum import Enum
-from typing import Dict, List, Optional,
-
-import torch
+from typing import Dict, List, Optional, Union
 
 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -323,9 +321,7 @@ class BatchTokenIDOut:
     decoded_texts: List[str]
     decode_ids: List[int]
     read_offsets: List[int]
-    # Only used when
-    origin_input_ids: Optional[List[int]]
-    # Only used when `--skip-tokenizer-init` or `--return-token-ids` is set
+    # Only used when `--skip-tokenizer-init` is on
     output_ids: Optional[List[int]]
     # Detokenization configs
     skip_special_tokens: List[bool]
@@ -356,14 +352,7 @@ class BatchStrOut:
     # The output decoded strings
     output_strs: List[str]
 
-    # The token ids
-    origin_input_ids: Optional[List[int]]
-    output_ids: Optional[List[int]]
-
     # Token counts
-    # real input and output tokens can be get from
-    # origin_input_ids and output_ids by enabling --return_token_ids
-    # TODO (Shuai): Rename this to clarify the meaning.
     prompt_tokens: List[int]
     completion_tokens: List[int]
     cached_tokens: List[int]
@@ -468,6 +457,26 @@ class GetWeightsByNameReqOutput:
     parameter: list
 
 
+@dataclass
+class ReleaseMemoryOccupationReqInput:
+    pass
+
+
+@dataclass
+class ReleaseMemoryOccupationReqOutput:
+    pass
+
+
+@dataclass
+class ResumeMemoryOccupationReqInput:
+    pass
+
+
+@dataclass
+class ResumeMemoryOccupationReqOutput:
+    pass
+
+
 @dataclass
 class AbortReq:
     # The request id
@@ -479,6 +488,13 @@ class ProfileReq(Enum):
     STOP_PROFILE = 2
 
 
+@dataclass
+class ConfigureLoggingReq:
+    log_requests: Optional[bool] = None
+    dump_requests_folder: Optional[str] = None
+    dump_requests_threshold: Optional[int] = None
+
+
 @dataclass
 class OpenSessionReqInput:
     capacity_of_str_len: int
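
A small illustrative sketch (the handle() function is hypothetical, not sglang code) of how these new empty dataclasses act as typed control messages: the scheduler dispatch shown further below matches them by isinstance and replies with the paired ...Output object as an acknowledgement:

from sglang.srt.managers.io_struct import (
    ReleaseMemoryOccupationReqInput,
    ReleaseMemoryOccupationReqOutput,
)

def handle(recv_req):
    # match the control message by type, then reply with the paired empty
    # output object as an acknowledgement (mirrors the scheduler dispatch below)
    if isinstance(recv_req, ReleaseMemoryOccupationReqInput):
        # ... release GPU memory occupation here ...
        return ReleaseMemoryOccupationReqOutput()

print(handle(ReleaseMemoryOccupationReqInput()))  # ReleaseMemoryOccupationReqOutput()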

sglang/srt/managers/schedule_batch.py
CHANGED
@@ -44,7 +44,7 @@ from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
-from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
@@ -1163,6 +1163,11 @@ class ScheduleBatch:
             input_embeds=self.input_embeds,
             spec_algorithm=self.spec_algorithm,
             spec_info=self.spec_info,
+            capture_hidden_mode=(
+                getattr(self.spec_info, "capture_hidden_mode", CaptureHiddenMode.NULL)
+                if self.spec_info
+                else CaptureHiddenMode.NULL
+            ),
         )
 
     def copy(self):
@@ -1237,6 +1242,7 @@ class ModelWorkerBatch:
     # Speculative decoding
     spec_algorithm: SpeculativeAlgorithm = None
     spec_info: Optional[SpecInfo] = None
+    capture_hidden_mode: CaptureHiddenMode = None
 
 
 @triton.jit

sglang/srt/managers/scheduler.py
CHANGED
@@ -13,6 +13,7 @@
 # ==============================================================================
 """A scheduler that manages a tensor parallel GPU worker."""
 
+import faulthandler
 import logging
 import os
 import signal
@@ -46,6 +47,10 @@ from sglang.srt.managers.io_struct import (
     OpenSessionReqInput,
     OpenSessionReqOutput,
     ProfileReq,
+    ReleaseMemoryOccupationReqInput,
+    ReleaseMemoryOccupationReqOutput,
+    ResumeMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqOutput,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
     UpdateWeightFromDiskReqInput,
@@ -77,6 +82,7 @@ from sglang.srt.metrics.collector import SchedulerMetricsCollector, SchedulerSta
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.srt.utils import (
     broadcast_pyobj,
     configure_logger,
@@ -356,6 +362,10 @@ class Scheduler:
             t.start()
         self.parent_process = psutil.Process().parent()
 
+        self.memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=server_args.enable_memory_saver
+        )
+
         # Init profiler
         if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "":
             self.profiler = None
@@ -399,6 +409,8 @@ class Scheduler:
             self.watchdog_last_time = time.time()
             time.sleep(self.watchdog_timeout / 2)
 
+        # Wait sometimes so that the parent process can print the error.
+        time.sleep(5)
         self.parent_process.send_signal(signal.SIGQUIT)
 
     @torch.no_grad()
@@ -516,6 +528,12 @@ class Scheduler:
         elif isinstance(recv_req, GetWeightsByNameReqInput):
             parameter = self.get_weights_by_name(recv_req)
             self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter))
+        elif isinstance(recv_req, ReleaseMemoryOccupationReqInput):
+            self.release_memory_occupation()
+            self.send_to_tokenizer.send_pyobj(ReleaseMemoryOccupationReqOutput())
+        elif isinstance(recv_req, ResumeMemoryOccupationReqInput):
+            self.resume_memory_occupation()
+            self.send_to_tokenizer.send_pyobj(ResumeMemoryOccupationReqOutput())
         elif isinstance(recv_req, ProfileReq):
             if recv_req == ProfileReq.START_PROFILE:
                 self.start_profile()
@@ -962,10 +980,13 @@ class Scheduler:
                     self.tp_worker.forward_batch_generation(model_worker_batch)
                 )
             else:
-
-
-
-
+                (
+                    logits_output,
+                    next_token_ids,
+                    model_worker_batch,
+                    num_accepted_tokens,
+                ) = self.draft_worker.forward_batch_speculative_generation(batch)
+                self.num_generated_tokens += num_accepted_tokens
         elif batch.forward_mode.is_idle():
             model_worker_batch = batch.get_model_worker_batch()
             self.tp_worker.forward_batch_idle(model_worker_batch)
@@ -1250,7 +1271,6 @@ class Scheduler:
         decode_ids_list = []
         read_offsets = []
         output_ids = []
-        origin_input_ids = []
 
         skip_special_tokens = []
         spaces_between_special_tokens = []
@@ -1302,14 +1322,8 @@ class Scheduler:
             decode_ids, read_offset = req.init_incremental_detokenize()
             decode_ids_list.append(decode_ids)
             read_offsets.append(read_offset)
-            if self.skip_tokenizer_init
+            if self.skip_tokenizer_init:
                 output_ids.append(req.output_ids)
-            else:
-                output_ids = None
-            if self.server_args.return_token_ids:
-                origin_input_ids.append(req.origin_input_ids)
-            else:
-                origin_input_ids = None
             skip_special_tokens.append(req.sampling_params.skip_special_tokens)
             spaces_between_special_tokens.append(
                 req.sampling_params.spaces_between_special_tokens
@@ -1341,7 +1355,6 @@ class Scheduler:
                     decoded_texts,
                     decode_ids_list,
                     read_offsets,
-                    origin_input_ids,
                     output_ids,
                     skip_special_tokens,
                     spaces_between_special_tokens,
@@ -1513,8 +1526,9 @@ class Scheduler:
         return success, message
 
     def update_weights_from_distributed(
-        self,
-
+        self,
+        recv_req: UpdateWeightsFromDistributedReqInput,
+    ) -> Tuple[bool, str]:
         """Update the online model parameter."""
         success, message = self.tp_worker.update_weights_from_distributed(recv_req)
         if success:
@@ -1539,6 +1553,20 @@ class Scheduler:
         parameter = self.tp_worker.get_weights_by_name(recv_req)
         return parameter
 
+    def release_memory_occupation(self):
+        self.stashed_model_static_state = _export_static_state(
+            self.tp_worker.worker.model_runner.model
+        )
+        self.memory_saver_adapter.pause()
+        self.flush_cache()
+
+    def resume_memory_occupation(self):
+        self.memory_saver_adapter.resume()
+        _import_static_state(
+            self.tp_worker.worker.model_runner.model, self.stashed_model_static_state
+        )
+        del self.stashed_model_static_state
+
     def start_profile(self) -> None:
         if self.profiler is None:
             raise RuntimeError("Profiler is not enabled.")
@@ -1577,6 +1605,20 @@ class Scheduler:
             del self.sessions[session_id]
 
 
+def _export_static_state(model):
+    return dict(
+        buffers=[
+            (name, buffer.detach().clone()) for name, buffer in model.named_buffers()
+        ]
+    )
+
+
+def _import_static_state(model, static_params):
+    self_named_buffers = dict(model.named_buffers())
+    for name, tensor in static_params["buffers"]:
+        self_named_buffers[name][...] = tensor
+
+
 def run_scheduler_process(
     server_args: ServerArgs,
     port_args: PortArgs,
@@ -1586,6 +1628,7 @@ def run_scheduler_process(
     pipe_writer,
 ):
     setproctitle.setproctitle("sglang::scheduler")
+    faulthandler.enable()
 
     # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var
     if dp_rank is None and "SGLANG_DP_RANK" in os.environ:
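
A toy sketch of what the _export_static_state / _import_static_state helpers above do, namely snapshot a model's registered buffers before memory is released and copy them back in place on resume. TinyModel is a made-up stand-in for the real model, used only to show the round trip:

import torch
import torch.nn as nn

from sglang.srt.managers.scheduler import _export_static_state, _import_static_state


class TinyModel(nn.Module):
    # made-up module standing in for model_runner.model; only its buffer matters here
    def __init__(self):
        super().__init__()
        self.register_buffer("rope_cache", torch.arange(4, dtype=torch.float32))


model = TinyModel()
stash = _export_static_state(model)   # {"buffers": [("rope_cache", tensor([0., 1., 2., 3.]))]}

model.rope_cache.zero_()              # pretend the buffer was clobbered while memory was released
_import_static_state(model, stash)    # copy the snapshot back in place
assert torch.equal(model.rope_cache, torch.arange(4, dtype=torch.float32))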

sglang/srt/managers/session_controller.py
CHANGED
@@ -99,7 +99,7 @@ class Session:
 
         if last_req is not None:
             # trim bos token if it is an append
-            if req.input_ids[0] == tokenizer.bos_token_id:
+            if tokenizer is not None and req.input_ids[0] == tokenizer.bos_token_id:
                 req.input_ids = req.input_ids[1:]
 
         input_ids = (