sglang 0.4.9.post6__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_offline_throughput.py +20 -0
- sglang/bench_one_batch.py +3 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/model_config.py +4 -0
- sglang/srt/configs/step3_vl.py +172 -0
- sglang/srt/conversation.py +23 -0
- sglang/srt/disaggregation/decode.py +2 -8
- sglang/srt/disaggregation/launch_lb.py +5 -20
- sglang/srt/disaggregation/mooncake/conn.py +33 -15
- sglang/srt/disaggregation/prefill.py +2 -6
- sglang/srt/distributed/parallel_state.py +86 -1
- sglang/srt/entrypoints/engine.py +14 -18
- sglang/srt/entrypoints/http_server.py +10 -2
- sglang/srt/entrypoints/openai/serving_chat.py +2 -21
- sglang/srt/eplb/expert_distribution.py +5 -0
- sglang/srt/eplb/expert_location.py +17 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -0
- sglang/srt/eplb/expert_location_updater.py +2 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/step3_detector.py +436 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/jinja_template_utils.py +4 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
- sglang/srt/layers/attention/utils.py +6 -1
- sglang/srt/layers/moe/cutlass_moe.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +39 -674
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
- sglang/srt/layers/moe/fused_moe_triton/layer.py +152 -39
- sglang/srt/layers/quantization/fp8.py +52 -18
- sglang/srt/layers/quantization/unquant.py +0 -8
- sglang/srt/layers/quantization/w4afp8.py +1 -0
- sglang/srt/layers/quantization/w8a8_int8.py +4 -1
- sglang/srt/managers/cache_controller.py +165 -67
- sglang/srt/managers/data_parallel_controller.py +2 -0
- sglang/srt/managers/io_struct.py +0 -2
- sglang/srt/managers/scheduler.py +90 -671
- sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
- sglang/srt/managers/template_manager.py +62 -19
- sglang/srt/managers/tokenizer_manager.py +123 -74
- sglang/srt/managers/tp_worker.py +4 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
- sglang/srt/mem_cache/hicache_storage.py +60 -17
- sglang/srt/mem_cache/hiradix_cache.py +36 -8
- sglang/srt/mem_cache/memory_pool.py +15 -118
- sglang/srt/mem_cache/memory_pool_host.py +418 -29
- sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
- sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
- sglang/srt/mem_cache/nixl/hicache_nixl.py +163 -0
- sglang/srt/mem_cache/nixl/nixl_utils.py +238 -0
- sglang/srt/mem_cache/nixl/test_hicache_nixl_storage.py +216 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +183 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
- sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
- sglang/srt/model_executor/cuda_graph_runner.py +25 -1
- sglang/srt/model_executor/model_runner.py +13 -1
- sglang/srt/model_loader/weight_utils.py +2 -0
- sglang/srt/models/arcee.py +532 -0
- sglang/srt/models/deepseek_v2.py +7 -6
- sglang/srt/models/glm4_moe.py +6 -4
- sglang/srt/models/granitemoe.py +3 -0
- sglang/srt/models/grok.py +3 -0
- sglang/srt/models/hunyuan.py +1 -0
- sglang/srt/models/llama4.py +3 -0
- sglang/srt/models/mixtral.py +3 -0
- sglang/srt/models/olmoe.py +3 -0
- sglang/srt/models/phimoe.py +1 -0
- sglang/srt/models/step3_vl.py +991 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -16
- sglang/srt/multimodal/processors/step3_vl.py +515 -0
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +49 -18
- sglang/srt/speculative/eagle_worker.py +2 -0
- sglang/srt/utils.py +1 -0
- sglang/test/attention/test_trtllm_mla_backend.py +945 -0
- sglang/utils.py +0 -11
- sglang/version.py +1 -1
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/METADATA +3 -4
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/RECORD +83 -65
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/top_level.txt +0 -0
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py (new file)

@@ -0,0 +1,278 @@
+import atexit
+import concurrent.futures
+import json
+import logging
+import os
+import signal
+import threading
+from collections import OrderedDict
+from functools import wraps
+from typing import List, Optional
+
+import torch
+
+from sglang.srt.mem_cache.hicache_storage import HiCacheStorage
+from sglang.srt.mem_cache.storage.hf3fs.client_hf3fs import Hf3fsClient
+
+logger = logging.getLogger(__name__)
+
+
+class AtomicCounter:
+    def __init__(self, n: int):
+        assert n > 0
+        self.n = n
+        self._value = 0
+        self._lock = threading.Lock()
+
+    def next(self) -> int:
+        with self._lock:
+            current = self._value
+            self._value = (current + 1) % self.n
+            return current
+
+
+def synchronized():
+    def _decorator(func):
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            with self.lock:
+                return func(self, *args, **kwargs)
+
+        return wrapper
+
+    return _decorator
+
+
+class HiCacheHF3FS(HiCacheStorage):
+    default_env_var: str = "SGLANG_HICACHE_HF3FS_CONFIG_PATH"
+
+    def __init__(
+        self,
+        file_path: str,
+        file_size: int,
+        numjobs: int,
+        bytes_per_page: int,
+        entries: int,
+        dtype: torch.dtype,
+    ):
+        self.file_path = file_path
+        self.file_size = file_size
+        self.numjobs = numjobs
+        self.bytes_per_page = bytes_per_page
+        self.entries = entries
+        self.dtype = dtype
+
+        self.numel = self.bytes_per_page // self.dtype.itemsize
+
+        self.num_pages = self.file_size // self.bytes_per_page
+
+        logger.info(
+            "HiCacheHF3FS "
+            f"file_path = {self.file_path}, "
+            f"file_size = {self.file_size/(2**30):.2f} GB, "
+            f"numjobs = {self.numjobs}, "
+            f"bytes_per_page = {self.bytes_per_page/(2**20):.2f} MB, "
+            f"entries = {self.entries}, "
+            f"num_pages = {self.num_pages}"
+        )
+
+        self.ac = AtomicCounter(self.numjobs)
+        self.clients = [
+            Hf3fsClient(
+                self.file_path, self.file_size, self.bytes_per_page, self.entries
+            )
+            for _ in range(numjobs)
+        ]
+        self.executor = concurrent.futures.ThreadPoolExecutor(
+            max_workers=self.numjobs, thread_name_prefix="HiCacheHF3FS"
+        )
+
+        # Implemented a preliminary single-file page_hash -> file_offset index as interim storage.
+        # Future iterations may adopt a global KVCache manager to coordinate external cache instances
+        # through centralized metadata orchestration.
+        self.lock = threading.RLock()
+        self.free_pages = list(range(self.num_pages))
+        self.key_to_index = OrderedDict()
+
+        atexit.register(self.close)
+
+        signal.signal(signal.SIGINT, lambda sig, frame: self.close())
+        signal.signal(signal.SIGTERM, lambda sig, frame: self.close())
+        signal.signal(signal.SIGQUIT, lambda sig, frame: self.close())
+
+    @staticmethod
+    def from_env_config(
+        rank: int, bytes_per_page: int, dtype: torch.dtype
+    ) -> "HiCacheHF3FS":
+        config_path = os.getenv(HiCacheHF3FS.default_env_var)
+        if not config_path:
+            return HiCacheHF3FS(
+                file_path=f"/data/hicache.{rank}.bin",
+                file_size=1 << 40,
+                numjobs=16,
+                bytes_per_page=bytes_per_page,
+                entries=8,
+                dtype=dtype,
+            )
+
+        try:
+            with open(config_path, "r") as f:
+                config = json.load(f)
+        except Exception as e:
+            raise RuntimeError(f"Failed to load config from {config_path}: {str(e)}")
+
+        required_keys = {
+            "file_path_prefix",
+            "file_size",
+            "numjobs",
+            "entries",
+        }
+        missing_keys = required_keys - set(config.keys())
+        if missing_keys:
+            raise ValueError(f"Missing required keys in config: {missing_keys}")
+
+        return HiCacheHF3FS(
+            file_path=f"{config['file_path_prefix']}.{rank}.bin",
+            file_size=int(config["file_size"]),
+            numjobs=int(config["numjobs"]),
+            bytes_per_page=bytes_per_page,
+            entries=int(config["entries"]),
+            dtype=dtype,
+        )
+
+    def get(
+        self, key: str, target_location: Optional[torch.Tensor] = None
+    ) -> torch.Tensor | None:
+        return self.batch_get([key], target_location)[0]
+
+    @synchronized()
+    def batch_get(
+        self,
+        keys: List[str],
+        target_locations: Optional[List[torch.Tensor]] = None,
+    ) -> List[torch.Tensor | None]:
+        batch_indices, file_offsets = [], []
+        for i, key in enumerate(keys):
+            if key not in self.key_to_index:
+                continue
+            batch_indices.append(i)
+            file_offsets.append(self.key_to_index[key] * self.bytes_per_page)
+            self.key_to_index.move_to_end(key)
+        # TODO: target_locations
+        file_results = [
+            torch.empty(self.numel, dtype=self.dtype) for _ in range(len(batch_indices))
+        ]
+
+        futures = [
+            self.executor.submit(
+                self.clients[self.ac.next()].batch_read,
+                file_offsets[i : i + self.entries],
+                file_results[i : i + self.entries],
+            )
+            for i in range(0, len(batch_indices), self.entries)
+        ]
+        read_results = [result for future in futures for result in future.result()]
+
+        results = [None] * len(keys)
+        for batch_index, file_result, read_result in zip(
+            batch_indices, file_results, read_results
+        ):
+            if read_result == self.bytes_per_page:
+                results[batch_index] = file_result
+            else:
+                logger.error(f"HiCacheHF3FS get {keys[batch_index]} failed")
+
+        return results
+
+    def set(self, key: str, value: torch.Tensor) -> bool:
+        return self.batch_set([key], [value])
+
+    def batch_set(self, keys: List[str], values: List[torch.Tensor]) -> bool:
+        indices = self.get_batch_set_indices(keys)
+        batch_indices, file_offsets, file_values = [], [], []
+        for i, (value, (is_written, index)) in enumerate(zip(values, indices)):
+            if is_written or index == -1:
+                continue
+            batch_indices.append(i)
+            file_offsets.append(index * self.bytes_per_page)
+            file_values.append(value.contiguous())
+
+        futures = [
+            self.executor.submit(
+                self.clients[self.ac.next()].batch_write,
+                file_offsets[i : i + self.entries],
+                file_values[i : i + self.entries],
+            )
+            for i in range(0, len(batch_indices), self.entries)
+        ]
+        write_results = [
+            result == self.bytes_per_page
+            for future in futures
+            for result in future.result()
+        ]
+
+        results = [index[0] for index in indices]
+        for batch_index, write_result in zip(batch_indices, write_results):
+            key = keys[batch_index]
+            index = indices[batch_index][1]
+            if write_result:
+                self.key_to_index[key] = index
+                self.key_to_index.move_to_end(key)
+            else:
+                logger.error(f"HiCacheHF3FS set {key} failed")
+                self.free_pages.append(index)
+            results[batch_index] = write_result
+        return all(results)
+
+    @synchronized()
+    def get_batch_set_indices(self, keys: List[str]) -> list:
+        ionum = len(keys)
+        # results: tuples of (is_written: bool, page_idx: int)
+        # - is_written: True = hit (no I/O), False = write (miss)
+        # - page_idx: page storing data
+        results = [None] * min(ionum, self.num_pages)
+        if ionum > self.num_pages:
+            results.extend([(False, -1)] * (ionum - self.num_pages))
+
+        new_keys = []
+        for batch_index, key in enumerate(keys[: self.num_pages]):
+            if key in self.key_to_index:
+                results[batch_index] = (True, self.key_to_index[key])
+                self.key_to_index.move_to_end(key)
+            else:
+                new_keys.append((batch_index, key))
+
+        for batch_index, _ in new_keys:
+            index = (
+                self.free_pages.pop()
+                if len(self.free_pages) > 0
+                else self.key_to_index.popitem(last=False)[1]
+            )
+            results[batch_index] = (False, index)
+
+        return results
+
+    @synchronized()
+    def delete(self, key: str) -> None:
+        if key not in self.key_to_index:
+            return
+        index = self.key_to_index.pop(key)
+        self.free_pages.append(index)
+
+    @synchronized()
+    def exists(self, key: str) -> bool:
+        return key in self.key_to_index
+
+    @synchronized()
+    def clear(self) -> None:
+        self.free_pages = list(range(self.num_pages))
+        self.key_to_index.clear()
+
+    def close(self) -> None:
+        try:
+            for c in self.clients:
+                c.close()
+            self.executor.shutdown(wait=True)
+        except Exception as e:
+            logger.error(f"close HiCacheHF3FS: {e}")
+        logger.info("close HiCacheHF3FS")
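For reference, this new storage backend is configured through the JSON file pointed to by SGLANG_HICACHE_HF3FS_CONFIG_PATH; from_env_config requires the keys file_path_prefix, file_size, numjobs, and entries, and falls back to built-in defaults when the variable is unset. A minimal sketch of wiring this up is shown below. The paths and sizes are made-up illustration values, not sglang defaults, and actually constructing the storage requires a mounted HF3FS volume plus the compiled hf3fs_utils extension used by Hf3fsClient.

# Hypothetical config for HiCacheHF3FS.from_env_config; all values are illustrative only.
import json
import os
import tempfile

import torch

from sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs import HiCacheHF3FS

config = {
    "file_path_prefix": "/data/hicache",  # per-rank backing file becomes /data/hicache.<rank>.bin
    "file_size": 1 << 38,                 # 256 GiB backing file (example value)
    "numjobs": 16,                        # number of Hf3fsClient instances / worker threads
    "entries": 8,                         # pages handed to one batch_read/batch_write call
}
config_path = os.path.join(tempfile.gettempdir(), "hf3fs_hicache.json")
with open(config_path, "w") as f:
    json.dump(config, f)

os.environ[HiCacheHF3FS.default_env_var] = config_path  # SGLANG_HICACHE_HF3FS_CONFIG_PATH
storage = HiCacheHF3FS.from_env_config(rank=0, bytes_per_page=1 << 20, dtype=torch.bfloat16)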
sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py (new file)

@@ -0,0 +1,43 @@
+import multiprocessing.shared_memory
+from pathlib import Path
+
+import pytest
+import torch
+from torch.utils.cpp_extension import load
+from tqdm import tqdm
+
+root = Path(__file__).parent.resolve()
+hf3fs_utils = load(
+    name="hf3fs_utils", sources=[f"{root}/hf3fs_utils.cpp"], verbose=True
+)
+
+
+def test_rw_shm():
+    numel = 8 << 20
+    dtype = torch.bfloat16
+    page_num = 128
+    page_bytes = numel * dtype.itemsize
+    shm = multiprocessing.shared_memory.SharedMemory(
+        size=page_num * page_bytes, create=True
+    )
+    tshm = torch.frombuffer(shm.buf, dtype=torch.uint8)
+    a = [
+        torch.randn(numel, dtype=dtype)
+        for _ in tqdm(range(page_num), desc="prepare input")
+    ]
+    b = [
+        torch.empty(numel, dtype=dtype)
+        for _ in tqdm(range(page_num), desc="prepare output")
+    ]
+    hf3fs_utils.write_shm(a, tshm)
+    hf3fs_utils.read_shm(tshm, b)
+    for _a, _b in tqdm(zip(a, b), desc="assert_close"):
+        torch.testing.assert_close(_a, _b)
+
+    del tshm
+    shm.close()
+    shm.unlink()
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
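The test JIT-compiles the hf3fs_utils.cpp extension with torch.utils.cpp_extension.load and checks that a list of bfloat16 pages survives a round trip through a raw shared-memory buffer. Judging only from how write_shm and read_shm are called here, they appear to pack pages into, and unpack them out of, the uint8 shared-memory tensor; the sketch below is an assumed pure-PyTorch equivalent of that behavior, not the extension's actual implementation, which presumably performs the copies in C++.

# Assumed semantics of hf3fs_utils.write_shm / read_shm, inferred from the test above.
# Pages are taken to be contiguous 1-D tensors.
import torch

def write_shm_py(pages, shm_tensor):
    off = 0
    for page in pages:
        nbytes = page.numel() * page.element_size()
        shm_tensor[off : off + nbytes] = page.view(torch.uint8)  # copy page bytes in
        off += nbytes

def read_shm_py(shm_tensor, pages):
    off = 0
    for page in pages:
        nbytes = page.numel() * page.element_size()
        page.copy_(shm_tensor[off : off + nbytes].view(page.dtype))  # copy page bytes out
        off += nbytes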
sglang/srt/model_executor/cuda_graph_runner.py

@@ -16,6 +16,7 @@
 from __future__ import annotations

 import bisect
+import gc
 import inspect
 import logging
 import os
@@ -75,6 +76,24 @@ def model_capture_mode():
     is_capture_mode = False


+@contextmanager
+def freeze_gc(enable_cudagraph_gc: bool):
+    """
+    Optimize garbage collection during CUDA graph capture.
+    Clean up, then freeze all remaining objects from being included
+    in future collections if GC is disabled during capture.
+    """
+    gc.collect()
+    should_freeze = not enable_cudagraph_gc
+    if should_freeze:
+        gc.freeze()
+    try:
+        yield
+    finally:
+        if should_freeze:
+            gc.unfreeze()
+
+
 def _to_torch(model: torch.nn.Module, reverse: bool, num_tokens: int):
     for sub in model._modules.values():
         if isinstance(sub, CustomOp):
@@ -423,7 +442,12 @@ class CudaGraphRunner:
                 record_shapes=True,
             )

-
+        # Trigger CUDA graph capture for specific shapes.
+        # Capture the large shapes first so that the smaller shapes
+        # can reuse the memory pool allocated for the large shapes.
+        with freeze_gc(
+            self.model_runner.server_args.enable_cudagraph_gc
+        ), graph_capture() as graph_capture_context:
             with profile_context as prof:
                 self.stream = graph_capture_context.stream
                 avail_mem = get_available_gpu_memory(
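freeze_gc wraps graph capture so that, unless the enable_cudagraph_gc server arg is set, objects that are alive before capture are moved out of the collector's reach and any automatic collections during capture stay cheap. Below is a standalone illustration of the same stdlib pattern; the placeholder capture step and the printed count are illustrative, not sglang behavior.

# Standalone sketch of the gc.freeze()/gc.unfreeze() pattern used by freeze_gc above.
import gc

gc.collect()                  # drop garbage left over from setup/model loading
gc.freeze()                   # move all surviving objects to the permanent generation
print(gc.get_freeze_count())  # number of objects now exempt from collection
try:
    pass                      # CUDA graph capture would happen here
finally:
    gc.unfreeze()             # return frozen objects to the oldest generation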
sglang/srt/model_executor/model_runner.py

@@ -157,6 +157,8 @@ class ModelRunner:
         gpu_id: int,
         tp_rank: int,
         tp_size: int,
+        moe_ep_rank: int,
+        moe_ep_size: int,
         pp_rank: int,
         pp_size: int,
         nccl_port: int,
@@ -175,6 +177,8 @@ class ModelRunner:
         logger.addFilter(RankZeroFilter(tp_rank == 0))
         self.tp_rank = tp_rank
         self.tp_size = tp_size
+        self.moe_ep_rank = moe_ep_rank
+        self.moe_ep_size = moe_ep_size
         self.dp_size = server_args.dp_size
         self.pp_rank = pp_rank
         self.pp_size = pp_size
@@ -432,6 +436,7 @@ class ModelRunner:
             "triton",
             "flashmla",
             "cutlass_mla",
+            "trtllm_mla",
             "ascend",
         ]:
             logger.info(
@@ -549,6 +554,7 @@ class ModelRunner:
         initialize_model_parallel(
             tensor_model_parallel_size=self.tp_size,
             pipeline_model_parallel_size=self.pp_size,
+            expert_model_parallel_size=self.moe_ep_size,
             duplicate_tp_group=self.server_args.enable_pdmux,
         )
         initialize_dp_attention(
@@ -666,7 +672,7 @@ class ModelRunner:
             self.sliding_window_size = self.model.get_attention_sliding_window_size()
         elif self.model_config.attention_chunk_size is not None:
             self.sliding_window_size = self.model_config.attention_chunk_size
-
+            logger.info(
                 f"Setting sliding_window_size to be attention_chunk_size: {self.sliding_window_size}"
             )

@@ -1432,6 +1438,12 @@ class ModelRunner:
             )

             return CutlassMLABackend(self)
+        elif self.server_args.attention_backend == "trtllm_mla":
+            if not self.use_mla_backend:
+                raise ValueError("trtllm_mla backend can only be used with MLA models.")
+            from sglang.srt.layers.attention.trtllm_mla_backend import TRTLLMMLABackend
+
+            return TRTLLMMLABackend(self)
         elif self.server_args.attention_backend == "intel_amx":
             from sglang.srt.layers.attention.intel_amx_backend import (
                 IntelAMXAttnBackend,
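With the backend registered above, trtllm_mla can be requested like any other attention backend. A hedged sketch using the offline Engine API follows; it assumes, as in other sglang examples, that Engine keyword arguments map onto ServerArgs fields, and the model path is only a placeholder for an MLA-style model (a non-MLA model would hit the ValueError added above).

# Sketch only: model_path is a placeholder; requires a build with the TRT-LLM MLA kernels available.
import sglang as sgl

llm = sgl.Engine(
    model_path="deepseek-ai/DeepSeek-V3",  # placeholder MLA model
    attention_backend="trtllm_mla",        # selects TRTLLMMLABackend per the diff above
)
print(llm.generate("Hello", {"max_new_tokens": 8}))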
sglang/srt/model_loader/weight_utils.py

@@ -229,6 +229,8 @@ def get_quant_config(
                 f"Unsupported quantization config"
                 f" found for {model_config.quantization} in {f}."
             )
+        elif model_config.quantization == "w8a8_int8":
+            config["packed_modules_mapping"] = packed_modules_mapping

     return quant_cls.from_config(config)
