sglang 0.4.9.post5__py3-none-any.whl → 0.4.10__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +3 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/configs/step3_vl.py +172 -0
- sglang/srt/conversation.py +23 -0
- sglang/srt/disaggregation/decode.py +2 -8
- sglang/srt/disaggregation/prefill.py +2 -6
- sglang/srt/distributed/parallel_state.py +86 -1
- sglang/srt/entrypoints/engine.py +14 -18
- sglang/srt/entrypoints/http_server.py +23 -3
- sglang/srt/entrypoints/openai/protocol.py +3 -1
- sglang/srt/entrypoints/openai/serving_base.py +5 -2
- sglang/srt/entrypoints/openai/serving_chat.py +2 -21
- sglang/srt/eplb/expert_distribution.py +5 -0
- sglang/srt/eplb/expert_location.py +17 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -0
- sglang/srt/eplb/expert_location_updater.py +2 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/step3_detector.py +436 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/jinja_template_utils.py +4 -1
- sglang/srt/layers/moe/cutlass_moe.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +98 -603
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
- sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
- sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
- sglang/srt/layers/moe/topk.py +6 -2
- sglang/srt/layers/quantization/fp8.py +0 -18
- sglang/srt/layers/quantization/modelopt_quant.py +2 -0
- sglang/srt/layers/quantization/unquant.py +0 -8
- sglang/srt/layers/quantization/w4afp8.py +1 -0
- sglang/srt/managers/cache_controller.py +143 -45
- sglang/srt/managers/data_parallel_controller.py +6 -0
- sglang/srt/managers/io_struct.py +12 -2
- sglang/srt/managers/scheduler.py +116 -669
- sglang/srt/managers/scheduler_input_blocker.py +106 -0
- sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
- sglang/srt/managers/template_manager.py +62 -19
- sglang/srt/managers/tokenizer_manager.py +166 -83
- sglang/srt/managers/tp_worker.py +9 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
- sglang/srt/mem_cache/hicache_storage.py +45 -11
- sglang/srt/mem_cache/hiradix_cache.py +15 -4
- sglang/srt/mem_cache/memory_pool_host.py +73 -1
- sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
- sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
- sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
- sglang/srt/model_executor/model_runner.py +20 -13
- sglang/srt/models/arcee.py +532 -0
- sglang/srt/models/deepseek_v2.py +15 -56
- sglang/srt/models/glm4_moe.py +3 -1
- sglang/srt/models/granitemoe.py +3 -0
- sglang/srt/models/grok.py +3 -0
- sglang/srt/models/hunyuan.py +1 -0
- sglang/srt/models/llama4.py +3 -0
- sglang/srt/models/mixtral.py +3 -0
- sglang/srt/models/olmoe.py +3 -0
- sglang/srt/models/phimoe.py +1 -0
- sglang/srt/models/qwen3_moe.py +12 -69
- sglang/srt/models/step3_vl.py +994 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -16
- sglang/srt/multimodal/processors/step3_vl.py +515 -0
- sglang/srt/poll_based_barrier.py +31 -0
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +18 -13
- sglang/srt/speculative/eagle_worker.py +2 -0
- sglang/srt/two_batch_overlap.py +8 -3
- sglang/test/test_utils.py +53 -0
- sglang/utils.py +0 -11
- sglang/version.py +1 -1
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/METADATA +4 -4
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/RECORD +84 -64
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0
sglang/srt/managers/cache_controller.py
CHANGED
@@ -26,6 +26,11 @@ if TYPE_CHECKING:
     from sglang.srt.mem_cache.memory_pool_host import HostKVCache

 from sglang.srt.mem_cache.hicache_storage import HiCacheFile, get_hash_str
+from sglang.srt.mem_cache.mooncake_store.mooncake_store import (
+    MooncakeStore,
+    get_hash_str_mooncake,
+)
+from sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs import HiCacheHF3FS

 logger = logging.getLogger(__name__)

@@ -124,7 +129,7 @@ class TransferBuffer:
     """

     def __init__(
-        self, stop_event, buffer_count: int = 3, max_buffer_size: int =
+        self, stop_event, buffer_count: int = 3, max_buffer_size: int = 1024
     ) -> None:
         self.stop_event = stop_event
         self.buffers = Queue(maxsize=buffer_count)
@@ -250,17 +255,39 @@ class HiCacheController:
         self.tp_world_size = torch.distributed.get_world_size(group=tp_group)
         if self.tp_world_size > 1:
             group_ranks = torch.distributed.get_process_group_ranks(tp_group)
-            self.
+            self.prefetch_tp_group = torch.distributed.new_group(
+                group_ranks, backend="gloo"
+            )
+            self.backup_tp_group = torch.distributed.new_group(
+                group_ranks, backend="gloo"
+            )

         if storage_backend == "file":
             self.storage_backend = HiCacheFile()
-            self.
-
-            self.
+            self.get_hash_str = get_hash_str
+        elif storage_backend == "mooncake":
+            self.storage_backend = MooncakeStore()
+            self.get_hash_str = get_hash_str_mooncake
+            self.storage_backend.register_buffer(self.mem_pool_host.kv_buffer)
+        elif storage_backend == "hf3fs":
+            from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+            rank = get_tensor_model_parallel_rank()
+            bytes_per_page = (
+                mem_pool_host.get_size_per_token() * mem_pool_host.page_size
+            )
+            dtype = mem_pool_host.dtype
+            self.storage_backend = HiCacheHF3FS.from_env_config(
+                rank, bytes_per_page, dtype
+            )
+            self.get_hash_str = get_hash_str
         else:
             raise NotImplementedError(
                 f"Unsupported storage backend: {storage_backend}"
             )
+        self.enable_storage = True
+        # todo: threshold policy for prefetching
+        self.prefetch_threshold = max(prefetch_threshold, self.page_size)

         self.load_cache_event = load_cache_event
         self.layer_done_counter = LayerDoneCounter(self.mem_pool_device.layer_num)
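The hunk above hangs three interchangeable storage backends off a single `storage_backend` flag, and each branch also binds the matching page-hash function (`get_hash_str` vs. `get_hash_str_mooncake`). A self-contained sketch of the same dispatch shape; the stub classes and `make_backend()` are illustrative stand-ins, not sglang APIs:

```python
# Stand-ins mirroring the if/elif dispatch in HiCacheController.__init__;
# these are not the real sglang classes.
class FileBackend:
    def exists(self, key: str) -> bool:   # per-key existence check
        return False

class MooncakeBackend:
    def exists(self, keys: list) -> dict:  # batched existence check
        return {k: 0 for k in keys}

def make_backend(name: str):
    backends = {"file": FileBackend, "mooncake": MooncakeBackend}
    if name not in backends:
        raise NotImplementedError(f"Unsupported storage backend: {name}")
    return backends[name]()

backend = make_backend("mooncake")
print(backend.exists(["h0", "h1"]))  # {'h0': 0, 'h1': 0}
```

The asymmetry in `exists()` (per-key vs. batched) is exactly what the later hunks special-case with `isinstance(self.storage_backend, MooncakeStore)`.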
@@ -515,6 +542,37 @@ class HiCacheController:
         operation.mark_done()
         return operation.completed_tokens, operation.hash_value

+    def generic_page_transfer(self, operation, batch_size=8):
+        for i in range(0, len(operation.hash_value), batch_size):
+            page_hashes = operation.hash_value[i : i + batch_size]
+            page_data = self.storage_backend.batch_get(page_hashes)
+            if page_data is None:
+                logger.warning(
+                    f"Prefetch operation {operation.request_id} failed to retrieve page {page_hashes}."
+                )
+                break
+            completed_tokens = operation.completed_tokens
+            if operation.increment(self.page_size * len(page_hashes)):
+                for i in range(len(page_hashes)):
+                    self.mem_pool_host.set_from_flat_data_page(
+                        operation.host_indices[completed_tokens],
+                        page_data[i],
+                    )
+                    completed_tokens += self.page_size
+            else:
+                # operation terminated by controller, release pre-allocated memory
+                self.mem_pool_host.free(
+                    operation.host_indices[operation.completed_tokens :]
+                )
+                break
+
+    def mooncake_page_transfer(self, operation):
+        key_strs, buffer_ptrs, buffer_sizes = self.mem_pool_host.get_buffer_meta(
+            operation.hash_value, operation.host_indices
+        )
+        self.storage_backend.batch_get(key_strs, buffer_ptrs, buffer_sizes)
+        operation.increment(len(operation.hash_value) * self.page_size)
+
     def prefetch_io_aux_func(self):
         """
         Auxiliary function conducting IO operations for prefetching.
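`generic_page_transfer` turns the old one-page-at-a-time prefetch into batched round-trips: fetch up to `batch_size` pages per `batch_get`, copy each into the host pool, and abort on the first miss. A toy model of that control flow, with a `dict` standing in for the storage backend and a plain list for the host pool:

```python
# Toy model of generic_page_transfer's batching; not the sglang API.
def transfer_pages(store, host_pool, page_hashes, page_size=4, batch_size=2):
    completed_tokens = 0
    for i in range(0, len(page_hashes), batch_size):
        batch = page_hashes[i : i + batch_size]
        pages = [store.get(h) for h in batch]      # batch_get() stand-in
        if any(p is None for p in pages):
            break                                  # a miss aborts the prefetch
        for page in pages:
            host_pool[completed_tokens : completed_tokens + page_size] = page
            completed_tokens += page_size
    return completed_tokens

store = {f"h{i}": [i] * 4 for i in range(3)}       # pages h0..h2 exist
pool = [None] * 16
# 8: the batch containing the missing h3 is dropped whole
print(transfer_pages(store, pool, ["h0", "h1", "h2", "h3"]))
```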
@@ -522,24 +580,10 @@ class HiCacheController:
         while not self.stop_event.is_set():
             try:
                 operation = self.prefetch_buffer.get(block=True, timeout=1)
-                for h in operation.hash_value:
-                    page_data = self.storage_backend.get(h)
-                    if page_data is None:
-                        logger.warning(
-                            f"Prefetch operation {operation.request_id} failed to retrieve page {h}."
-                        )
-                        break
-                    if operation.increment(self.page_size):
-                        self.mem_pool_host.set_from_flat_data_page(
-                            operation.host_indices[operation.completed_tokens],
-                            page_data,
-                        )
-                    else:
-                        # operation terminated by controller, release pre-allocated memory
-                        self.mem_pool_host.free(
-                            operation.host_indices[operation.completed_tokens :]
-                        )
-                        break
+                if isinstance(self.storage_backend, MooncakeStore):
+                    self.mooncake_page_transfer(operation)
+                else:
+                    self.generic_page_transfer(operation)
             except Empty:
                 continue

@@ -563,18 +607,27 @@ class HiCacheController:
                 remaining_tokens = len(tokens_to_fetch)
                 hash_value = []
                 while remaining_tokens >= self.page_size:
-                    last_hash = get_hash_str(
+                    last_hash = self.get_hash_str(
                         tokens_to_fetch[
                             storage_hit_count : storage_hit_count + self.page_size
                         ],
                         last_hash,
                     )
-
-
-
-
-
-
+
+                    # todo, more unified interface
+                    if not isinstance(self.storage_backend, MooncakeStore):
+                        if not self.storage_backend.exists(last_hash):
+                            break
+                    hash_value.append(last_hash)
+                    storage_hit_count += self.page_size
+                    remaining_tokens -= self.page_size
+
+                if isinstance(self.storage_backend, MooncakeStore):
+                    # deferring to batch exists for mooncake store
+                    exist_result = self.storage_backend.exists(hash_value)
+                    storage_hit_count = (
+                        sum(1 for v in exist_result.values() if v != 0) * self.page_size
+                    )

                 if self.tp_world_size > 1:
                     storage_hit_count_tensor = torch.tensor(
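The lookup loop above chains page hashes: each call to `self.get_hash_str` folds the previous `last_hash` into the next page's hash, so the pages a request can reuse always form a prefix, and the walk may stop at the first miss (or, for mooncake, defer to one batched `exists`). An illustrative stand-in for that chaining; the real hashing lives in `hicache_storage.get_hash_str` / `get_hash_str_mooncake`:

```python
# Illustrative chained page hashing; not sglang's actual hash function.
import hashlib

def chained_page_hashes(token_ids, page_size, last_hash=""):
    hashes = []
    usable = len(token_ids) - len(token_ids) % page_size  # whole pages only
    for start in range(0, usable, page_size):
        page = token_ids[start : start + page_size]
        payload = (last_hash + ":" + ",".join(map(str, page))).encode()
        last_hash = hashlib.sha256(payload).hexdigest()   # folds prior hash in
        hashes.append(last_hash)
    return hashes

print(len(chained_page_hashes(list(range(10)), page_size=4)))  # 2 full pages
```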
@@ -583,7 +636,7 @@ class HiCacheController:
                     torch.distributed.all_reduce(
                         storage_hit_count_tensor,
                         op=torch.distributed.ReduceOp.MIN,
-                        group=self.
+                        group=self.prefetch_tp_group,
                     )
                     storage_hit_count = storage_hit_count_tensor.item()

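`ReduceOp.MIN` is the agreement step here: with TP > 1, every rank must load the same number of prefetched tokens, so each contributes its local hit count and all adopt the smallest. A single-process gloo sketch of that collective (world_size=1, so the value passes through unchanged); the real code uses the dedicated `prefetch_tp_group` created earlier:

```python
# Single-process sketch of the MIN all-reduce; assumes torch is installed.
import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
dist.init_process_group("gloo", rank=0, world_size=1)

local_hits = torch.tensor(48)                      # tokens this rank found
dist.all_reduce(local_hits, op=dist.ReduceOp.MIN)  # every rank takes the min
print(local_hits.item())                           # 48 with a single rank

dist.destroy_process_group()
```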
@@ -622,6 +675,47 @@ class HiCacheController:
             self.backup_queue.put(operation)
         return operation.id

+    def generic_page_backup(self, operation, batch_size=8):
+        for i in range(0, len(operation.hash_value), batch_size):
+            page_hashes = operation.hash_value[i : i + batch_size]
+            page_data = [
+                self.mem_pool_host.get_flat_data_pages(
+                    operation.host_indices[j * self.page_size]
+                )
+                for j in range(i, i + len(page_hashes))
+            ]
+            success = self.storage_backend.batch_set(page_hashes, page_data)
+            if not success:
+                logger.warning(f"Failed to write page {page_hashes} to storage.")
+                break
+            operation.completed_tokens += self.page_size * len(page_hashes)
+
+    def mooncake_page_backup(self, operation):
+        if len(operation.hash_value):
+            exist_hashvalues = self.storage_backend.exists(operation.hash_value)
+            indices = operation.host_indices.tolist()
+            non_exist_keys = []
+            non_exist_indices = []
+            for i in range(len(operation.hash_value)):
+                if not exist_hashvalues[operation.hash_value[i]]:
+                    non_exist_keys.append(operation.hash_value[i])
+                    non_exist_indices.extend(
+                        indices[i * self.page_size : (i + 1) * self.page_size]
+                    )
+            if len(non_exist_keys) > 0:
+                key_strs, buffer_ptrs, buffer_sizes = (
+                    self.mem_pool_host.get_buffer_meta(
+                        non_exist_keys, non_exist_indices
+                    )
+                )
+                # TODO: check the return value of batch set to see how many tokens are set successfully
+                self.storage_backend.batch_set(
+                    key_strs,
+                    target_location=buffer_ptrs,
+                    target_sizes=buffer_sizes,
+                )
+        operation.completed_tokens += len(operation.hash_value) * self.page_size
+
     def backup_thread_func(self):
         """
         Manage backup operations from host memory to storage backend.
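`mooncake_page_backup` deduplicates before writing: one batched `exists` call marks which page keys the store already holds, and only the missing pages are handed to `batch_set`. A toy version of that dedup step, with a `dict` standing in for `MooncakeStore` and simplified `exists`/`batch_set` shapes:

```python
# Toy dedup-then-write; not the MooncakeStore API.
def backup_missing_pages(store, page_hashes, pages):
    exist = {h: h in store for h in page_hashes}  # batched exists() stand-in
    written = 0
    for h, page in zip(page_hashes, pages):
        if not exist[h]:
            store[h] = page                       # batch_set() stand-in
            written += 1
    return written

store = {"h0": [0, 0]}
print(backup_missing_pages(store, ["h0", "h1"], [[0, 0], [1, 1]]))  # 1
```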
@@ -635,21 +729,25 @@ class HiCacheController:
                 last_hash = operation.last_hash
                 tokens_to_backup = operation.token_ids

-
-
-
-
-
+                backup_hit_count = 0
+                remaining_tokens = len(tokens_to_backup)
+                hash_value = []
+                while remaining_tokens >= self.page_size:
+                    last_hash = self.get_hash_str(
+                        tokens_to_backup[
+                            backup_hit_count : backup_hit_count + self.page_size
+                        ],
                         last_hash,
-                        self.mem_pool_host.get_flat_data_page(
-                            operation.host_indices[i]
-                        ),
                     )
-
-
-
-
-
+                    backup_hit_count += self.page_size
+                    hash_value.append(last_hash)
+                    remaining_tokens -= self.page_size
+                operation.hash_value = hash_value
+
+                if isinstance(self.storage_backend, MooncakeStore):
+                    self.mooncake_page_backup(operation)
+                else:
+                    self.generic_page_backup(operation)

                 min_completed_tokens = operation.completed_tokens
                 if self.tp_world_size > 1:
@@ -659,7 +757,7 @@ class HiCacheController:
                     torch.distributed.all_reduce(
                         completed_tokens_tensor,
                         op=torch.distributed.ReduceOp.MIN,
-                        group=self.
+                        group=self.backup_tp_group,
                     )
                     min_completed_tokens = completed_tokens_tensor.item()

sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -26,6 +26,7 @@ import zmq

 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.managers.io_struct import (
+    BlockReqInput,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
 )
@@ -221,6 +222,7 @@ class DataParallelController:
             + ((pp_rank % pp_size_per_node) * tp_size_per_node)
             + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
         )
+        moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
        proc = mp.Process(
             target=run_scheduler_process,
             args=(
@@ -228,6 +230,7 @@ class DataParallelController:
                 rank_port_args,
                 gpu_id,
                 tp_rank,
+                moe_ep_rank,
                 pp_rank,
                 dp_rank,
                 writer,
@@ -282,6 +285,9 @@ class DataParallelController:
             ),
         ):
             self.dispatching(recv_req)
+        elif isinstance(recv_req, BlockReqInput):
+            for worker in self.workers:
+                worker.send_pyobj(recv_req)
         else:
             # Send other control messages to first worker of tp group
             for worker in self.workers[:: self.control_message_step]:
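The new `elif` gives `BlockReqInput` a distinct fan-out: it goes to every worker socket, whereas other control messages only reach the first worker of each TP group (`workers[::control_message_step]`). A runnable sketch of that routing rule; `Worker` is a stub for the zmq sockets and the boolean flag stands in for the `isinstance` check:

```python
# Routing sketch; Worker stubs the per-scheduler zmq sockets.
from dataclasses import dataclass, field

@dataclass
class Worker:
    inbox: list = field(default_factory=list)
    def send_pyobj(self, obj):
        self.inbox.append(obj)

def route_control(req, workers, control_message_step, is_block_req):
    targets = workers if is_block_req else workers[::control_message_step]
    for w in targets:
        w.send_pyobj(req)

workers = [Worker() for _ in range(4)]
route_control("block", workers, control_message_step=2, is_block_req=True)
route_control("flush", workers, control_message_step=2, is_block_req=False)
print([len(w.inbox) for w in workers])  # [2, 1, 2, 1]
```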
sglang/srt/managers/io_struct.py
CHANGED
@@ -152,8 +152,6 @@ class GenerateReqInput:
         else:
             self._normalize_batch_inputs()

-        self._validate_session_params()
-
     def _validate_inputs(self):
         """Validate that the input configuration is valid."""
         if (
@@ -911,6 +909,8 @@ class AbortReq:
     rid: str = ""
     # Whether to abort all requests
     abort_all: bool = False
+    # The finished reason data
+    finished_reason: Optional[Dict[str, Any]] = None


 @dataclass
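A quick illustration of the extended dataclass; the keys placed in `finished_reason` below are assumptions made for the example, not a schema taken from sglang:

```python
# AbortReq as defined in the hunk above; the payload dict is illustrative.
from dataclasses import dataclass
from typing import Any, Dict, Optional

@dataclass
class AbortReq:
    rid: str = ""
    abort_all: bool = False
    finished_reason: Optional[Dict[str, Any]] = None

req = AbortReq(rid="req-42", finished_reason={"type": "abort"})
print(req.finished_reason)  # {'type': 'abort'}
```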
@@ -1101,3 +1101,13 @@ class LoRAUpdateResult:


 LoadLoRAAdapterReqOutput = UnloadLoRAAdapterReqOutput = LoRAUpdateResult
+
+
+class BlockReqType(Enum):
+    BLOCK = 1
+    UNBLOCK = 2
+
+
+@dataclass
+class BlockReqInput:
+    type: BlockReqType