sglang 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_one_batch.py +8 -6
- sglang/bench_serving.py +1 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/srt/_custom_ops.py +2 -2
- sglang/srt/code_completion_parser.py +2 -44
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/constants.py +3 -0
- sglang/srt/conversation.py +19 -3
- sglang/srt/custom_op.py +5 -1
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -1
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +211 -72
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +15 -9
- sglang/srt/disaggregation/mini_lb.py +34 -4
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +30 -29
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +17 -12
- sglang/srt/disaggregation/prefill.py +144 -55
- sglang/srt/disaggregation/utils.py +155 -123
- sglang/srt/distributed/parallel_state.py +12 -4
- sglang/srt/entrypoints/engine.py +37 -29
- sglang/srt/entrypoints/http_server.py +153 -72
- sglang/srt/entrypoints/http_server_engine.py +0 -3
- sglang/srt/entrypoints/openai/__init__.py +0 -0
- sglang/srt/{openai_api → entrypoints/openai}/protocol.py +84 -10
- sglang/srt/entrypoints/openai/serving_base.py +149 -0
- sglang/srt/entrypoints/openai/serving_chat.py +921 -0
- sglang/srt/entrypoints/openai/serving_completions.py +424 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
- sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
- sglang/srt/entrypoints/openai/serving_score.py +61 -0
- sglang/srt/entrypoints/openai/usage_processor.py +81 -0
- sglang/srt/entrypoints/openai/utils.py +72 -0
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +7 -4
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +64 -10
- sglang/srt/function_call/function_call_parser.py +6 -6
- sglang/srt/function_call/llama32_detector.py +1 -1
- sglang/srt/function_call/mistral_detector.py +1 -1
- sglang/srt/function_call/pythonic_detector.py +1 -1
- sglang/srt/function_call/qwen25_detector.py +1 -1
- sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
- sglang/srt/layers/activation.py +40 -3
- sglang/srt/layers/attention/aiter_backend.py +20 -4
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +39 -15
- sglang/srt/layers/attention/flashattention_backend.py +71 -72
- sglang/srt/layers/attention/flashinfer_backend.py +10 -8
- sglang/srt/layers/attention/flashinfer_mla_backend.py +29 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -12
- sglang/srt/layers/attention/tbo_backend.py +3 -3
- sglang/srt/layers/attention/triton_backend.py +138 -130
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +28 -10
- sglang/srt/layers/dp_attention.py +11 -2
- sglang/srt/layers/layernorm.py +29 -2
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +2 -14
- sglang/srt/layers/moe/ep_moe/kernels.py +165 -7
- sglang/srt/layers/moe/ep_moe/layer.py +249 -33
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +7 -4
- sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
- sglang/srt/layers/moe/topk.py +107 -12
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +25 -17
- sglang/srt/layers/quantization/fp8_kernel.py +44 -15
- sglang/srt/layers/quantization/fp8_utils.py +87 -22
- sglang/srt/layers/quantization/modelopt_quant.py +62 -8
- sglang/srt/layers/quantization/utils.py +5 -2
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +42 -2
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/lora_manager.py +249 -105
- sglang/srt/lora/mem_pool.py +53 -50
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +33 -14
- sglang/srt/managers/io_struct.py +31 -10
- sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +79 -37
- sglang/srt/managers/schedule_policy.py +70 -56
- sglang/srt/managers/scheduler.py +220 -79
- sglang/srt/managers/template_manager.py +226 -0
- sglang/srt/managers/tokenizer_manager.py +40 -10
- sglang/srt/managers/tp_worker.py +12 -2
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
- sglang/srt/mem_cache/base_prefix_cache.py +52 -8
- sglang/srt/mem_cache/chunk_cache.py +11 -15
- sglang/srt/mem_cache/hiradix_cache.py +38 -25
- sglang/srt/mem_cache/memory_pool.py +213 -505
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +56 -28
- sglang/srt/model_executor/cuda_graph_runner.py +198 -100
- sglang/srt/model_executor/forward_batch_info.py +32 -10
- sglang/srt/model_executor/model_runner.py +28 -12
- sglang/srt/model_loader/loader.py +16 -2
- sglang/srt/model_loader/weight_utils.py +11 -2
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +29 -27
- sglang/srt/models/deepseek_v2.py +213 -173
- sglang/srt/models/glm4.py +312 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/mimo_mtp.py +2 -18
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/vila.py +305 -0
- sglang/srt/reasoning_parser.py +21 -11
- sglang/srt/sampling/sampling_batch_info.py +24 -0
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +351 -238
- sglang/srt/speculative/build_eagle_tree.py +1 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -9
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +130 -14
- sglang/srt/speculative/eagle_utils.py +468 -116
- sglang/srt/speculative/eagle_worker.py +258 -84
- sglang/srt/torch_memory_saver_adapter.py +19 -15
- sglang/srt/two_batch_overlap.py +4 -2
- sglang/srt/utils.py +235 -11
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +38 -3
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +2 -0
- sglang/test/test_utils.py +4 -1
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/METADATA +8 -14
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/RECORD +150 -128
- sglang/srt/entrypoints/verl_engine.py +0 -179
- sglang/srt/openai_api/adapter.py +0 -1990
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
@@ -15,6 +15,7 @@ from sglang.api import (
     get_server_info,
     image,
     select,
+    separate_reasoning,
     set_default_backend,
     system,
     system_begin,
@@ -54,6 +55,7 @@ __all__ = [
     "get_server_info",
     "image",
     "select",
+    "separate_reasoning",
     "set_default_backend",
     "system",
     "system_begin",
sglang/api.py
CHANGED
@@ -15,6 +15,7 @@ from sglang.lang.ir import (
     SglRoleBegin,
     SglRoleEnd,
     SglSelect,
+    SglSeparateReasoning,
     SglVideo,
 )
 
@@ -277,3 +278,9 @@ def assistant_begin():
 
 def assistant_end():
     return SglRoleEnd("assistant")
+
+
+def separate_reasoning(
+    expr: Optional[SglExpr] = None, model_type: Optional[str] = None
+):
+    return SglExprList([expr, SglSeparateReasoning(model_type, expr=expr)])
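The new `separate_reasoning` helper wraps a generation expression so that, after execution, the model's reasoning segment is split off from the final answer and stored under a derived variable name. A minimal usage sketch, assuming a default backend has already been set with `sgl.set_default_backend` and that "deepseek-r1" is among the model types accepted by `sglang.srt.reasoning_parser.ReasoningParser` (the program and question below are illustrative, not from the package):

import sglang as sgl

@sgl.function
def solve(s, question):
    s += sgl.user(question)
    # After execution, the reasoning is stored under "answer_reasoning_content".
    s += sgl.assistant(
        sgl.separate_reasoning(sgl.gen("answer"), model_type="deepseek-r1")
    )

state = solve.run(question="What is 17 * 24?")
print(state["answer"])                    # final answer with reasoning stripped
print(state["answer_reasoning_content"])  # the extracted reasoning text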
sglang/bench_one_batch.py
CHANGED
@@ -71,6 +71,8 @@ from sglang.srt.utils import (
     configure_logger,
     get_bool_env_var,
     kill_process_tree,
+    require_mlp_sync,
+    require_mlp_tp_gather,
     set_gpu_proc_affinity,
     suppress_other_loggers,
 )
@@ -243,7 +245,7 @@ def extend(reqs, model_runner):
         enable_custom_logit_processor=False,
     )
     batch.prepare_for_extend()
-
+    _maybe_prepare_mlp_sync_batch(batch, model_runner)
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output, _ = model_runner.forward(forward_batch)
@@ -255,7 +257,7 @@ def extend(reqs, model_runner):
 def decode(input_token_ids, batch, model_runner):
     batch.output_ids = input_token_ids
     batch.prepare_for_decode()
-
+    _maybe_prepare_mlp_sync_batch(batch, model_runner)
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output, _ = model_runner.forward(forward_batch)
@@ -263,18 +265,18 @@ def decode(input_token_ids, batch, model_runner):
     return next_token_ids, logits_output.next_token_logits
 
 
-def
-    if model_runner.server_args
-        Scheduler.
+def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
+    if require_mlp_sync(model_runner.server_args):
+        Scheduler.prepare_mlp_sync_batch_raw(
             batch,
             dp_size=model_runner.server_args.dp_size,
             attn_tp_size=1,
-            moe_dense_tp_size=model_runner.server_args.moe_dense_tp_size,
             tp_cpu_group=model_runner.tp_group.cpu_group,
             get_idle_batch=None,
             disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
             spec_algorithm=SpeculativeAlgorithm.NONE,
             speculative_num_draft_tokens=None,
+            require_mlp_tp_gather=require_mlp_tp_gather(model_runner.server_args),
         )
 
 
sglang/bench_serving.py
CHANGED
@@ -399,7 +399,7 @@ async def async_request_sglang_generate(
                 # NOTE: Some completion API might have a last
                 # usage summary response without a token so we
                 # want to check a token was generated
-                if data["text"]:
+                if "text" in data and data["text"]:
                     timestamp = time.perf_counter()
                     generated_text = data["text"]
                     output_len = data["meta_info"]["completion_tokens"]
sglang/lang/interpreter.py
CHANGED
@@ -26,6 +26,7 @@ from sglang.lang.ir import (
     SglRoleBegin,
     SglRoleEnd,
     SglSelect,
+    SglSeparateReasoning,
     SglVariable,
     SglVarScopeBegin,
     SglVarScopeEnd,
@@ -472,6 +473,8 @@ class StreamExecutor:
                 self._execute_concatenate_and_append_kv_cache(other)
             else:
                 self._execute_concatenate_and_append_text(other)
+        elif isinstance(other, SglSeparateReasoning):
+            self._execute_separate_reasoning(other)
         else:
             raise ValueError(f"Unknown type: {type(other)}")
 
@@ -724,8 +727,44 @@
         src_rids = [state.stream_executor.sid for state in expr.states]
         self.backend.concatenate_and_append(src_rids, self.sid)
 
+    def _execute_separate_reasoning(self, expr: SglSeparateReasoning):
+        if self.stream:
+            # separate reasoning for stream is not supported
+            return
+
+        if (
+            self.cur_role == "assistant"
+            and self.num_api_spec_tokens is not None
+            and self.backend.is_chat_model
+        ):
+            # Execute the stored lazy generation calls
+            self.backend.role_end_generate(self)
+
+        from sglang.srt.reasoning_parser import ReasoningParser
+
+        reasoning_parser = ReasoningParser(expr.model_type)
+        other = expr.expr
+        if not other:
+            return
+        elif isinstance(other, SglGen) or isinstance(other, SglSelect):
+            cur_text = self.get_var(other.name)
+            reasoning, normal_text = reasoning_parser.parse_non_stream(cur_text)
+            reasoning_name = expr.process_name_for_reasoning(other.name)
+            self.set_var(other.name, normal_text)
+            self.set_var(reasoning_name, reasoning)
+            # the variable is ready to be used
+            self.variable_event[reasoning_name].set()
+            self.text_ = self.text_[: self.cur_role_begin_pos] + normal_text
+        elif isinstance(other, SglExprList):
+            for x in other.expr_list:
+                self._execute_separate_reasoning(
+                    SglSeparateReasoning(expr.model_type, x)
+                )
+
     def _init_var_event(self, expr):
-        if isinstance(
+        if isinstance(
+            expr, (SglGen, SglSelect, SglVarScopeBegin, SglSeparateReasoning)
+        ):
             self.variable_event[expr.name] = threading.Event()
             if self.stream:
                 self.stream_var_event[expr.name] = threading.Event()
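`_execute_separate_reasoning` defers the actual splitting to `ReasoningParser.parse_non_stream`, which returns a `(reasoning, normal_text)` pair; the executor then rewrites the generation variable to the normal text and publishes the reasoning under the derived name. A standalone sketch of that parsing step, assuming a DeepSeek-R1-style model that wraps its chain-of-thought in <think>...</think> tags (input string and expected outputs are illustrative):

from sglang.srt.reasoning_parser import ReasoningParser

parser = ReasoningParser("deepseek-r1")
reasoning, normal_text = parser.parse_non_stream(
    "<think>17 * 24 = 17 * 25 - 17 = 408</think>The answer is 408."
)
# reasoning   -> "17 * 24 = 17 * 25 - 17 = 408"
# normal_text -> "The answer is 408."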
sglang/lang/ir.py
CHANGED
@@ -606,3 +606,30 @@ class SglCommitLazy(SglExpr):
 
     def __repr__(self):
         return "CommitLazy()"
+
+
+class SglSeparateReasoning(SglExpr):
+    def __init__(self, model_type: str, expr: SglExpr):
+        super().__init__()
+        self.model_type = model_type
+
+        self.expr = expr
+        self.name = None
+        self._process_expr(expr)
+
+    def process_name_for_reasoning(self, name):
+        if not name:
+            raise ValueError("name must be provided")
+        return f"{name}_reasoning_content"
+
+    def _process_expr(self, expr):
+        if isinstance(expr, SglGen):
+            self.name = self.process_name_for_reasoning(expr.name)
+        elif isinstance(expr, SglSelect):
+            self.name = self.process_name_for_reasoning(expr.name)
+        elif isinstance(expr, SglExprList):
+            for x in expr.expr_list:
+                self._process_expr(x)
+
+    def __repr__(self):
+        return f"SeparateReasoning(model_type={self.model_type}, name={self.name})"
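`SglSeparateReasoning` walks the wrapped expression to find the generation variable and derives a companion name for the extracted reasoning via `process_name_for_reasoning`. A quick illustration of the naming rule, constructing the IR node directly (no backend needed):

import sglang as sgl
from sglang.lang.ir import SglSeparateReasoning

node = SglSeparateReasoning("deepseek-r1", sgl.gen("answer"))
print(node.name)  # "answer_reasoning_content"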
sglang/math_utils.py
ADDED
sglang/srt/_custom_ops.py
CHANGED
@@ -4,7 +4,7 @@ from typing import List, Tuple
 
 import torch
 
-from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
+from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu, is_npu
 
 logger = logging.getLogger(__name__)
 use_vllm_custom_allreduce = get_bool_env_var(
@@ -25,7 +25,7 @@ if not is_hpu():
         logger.warning("Failed to import from custom_ar with %r", e)
 
 
-if not is_hip():
+if not is_hip() and not is_npu():
     if use_vllm_custom_allreduce:
         custom_op = torch.ops._C_custom_ar
     else:
sglang/srt/code_completion_parser.py
CHANGED
@@ -15,12 +15,10 @@
 
 
 import dataclasses
-import json
 import logging
-import os
 from enum import auto
 
-from sglang.srt.
+from sglang.srt.entrypoints.openai.protocol import CompletionRequest
 
 logger = logging.getLogger(__name__)
 completion_template_name = None
@@ -57,46 +55,6 @@ class CompletionTemplate:
 completion_templates: dict[str, CompletionTemplate] = {}
 
 
-def load_completion_template_for_openai_api(completion_template_arg):
-    global completion_template_name
-
-    logger.info(
-        f"Use completion template for the OpenAI-compatible API server: {completion_template_arg}"
-    )
-
-    if not completion_template_exists(completion_template_arg):
-        if not os.path.exists(completion_template_arg):
-            raise RuntimeError(
-                f"Completion template {completion_template_arg} is not a built-in template name "
-                "or a valid completion template file path."
-            )
-
-        assert completion_template_arg.endswith(
-            ".json"
-        ), "unrecognized format of completion template file"
-        with open(completion_template_arg, "r") as filep:
-            template = json.load(filep)
-            try:
-                fim_position = FimPosition[template["fim_position"]]
-            except KeyError:
-                raise ValueError(
-                    f"Unknown fim position: {template['fim_position']}"
-                ) from None
-            register_completion_template(
-                CompletionTemplate(
-                    name=template["name"],
-                    fim_begin_token=template["fim_begin_token"],
-                    fim_middle_token=template["fim_middle_token"],
-                    fim_end_token=template["fim_end_token"],
-                    fim_position=fim_position,
-                ),
-                override=True,
-            )
-            completion_template_name = template["name"]
-    else:
-        completion_template_name = completion_template_arg
-
-
 def register_completion_template(template: CompletionTemplate, override: bool = False):
     """Register a new completion template."""
     if not override:
@@ -116,7 +74,7 @@ def is_completion_template_defined() -> bool:
     return completion_template_name is not None
 
 
-def generate_completion_prompt_from_request(request:
+def generate_completion_prompt_from_request(request: CompletionRequest) -> str:
     global completion_template_name
     if request.suffix == "":
         return request.prompt
sglang/srt/configs/model_config.py
CHANGED
@@ -550,6 +550,11 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         or "Qwen2ForRewardModel" in model_architectures
         or "Qwen2ForSequenceClassification" in model_architectures
         or "CLIPModel" in model_architectures
+        or "BertModel" in model_architectures
+        or "Contriever" in model_architectures
+        or "BertForSequenceClassification" in model_architectures
+        or "XLMRobertaModel" in model_architectures
+        or "XLMRobertaForSequenceClassification" in model_architectures
     ):
         return False
     else:
@@ -578,6 +583,7 @@ multimodal_model_archs = [
     "KimiVLForConditionalGeneration",
     "InternVLChatModel",
     "Phi4MMForCausalLM",
+    "VILAForConditionalGeneration",
 ]
 
 
sglang/srt/constants.py
ADDED
sglang/srt/conversation.py
CHANGED
@@ -11,7 +11,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Conversation chat templates.
+"""Conversation chat templates.
+
+This module provides conversation template definitions, data structures, and utilities
+for managing chat templates across different model types in SGLang.
+
+Key components:
+- Conversation class: Defines the structure and behavior of chat templates
+- SeparatorStyle enum: Different conversation formatting styles
+- Template registry: Functions to register and retrieve templates by name or model path
+- Built-in templates: Pre-defined templates for popular models
+"""
 
 # Adapted from
 # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
@@ -20,7 +30,7 @@ import re
 from enum import IntEnum, auto
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
-from sglang.srt.
+from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
 from sglang.srt.utils import read_system_prompt_from_file
 
 
@@ -618,7 +628,7 @@ def generate_chat_conv(
 
 
 # llama2 template
-# reference: https://
+# reference: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
 # reference: https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/generation.py#L212
 register_conv_template(
     Conversation(
@@ -983,3 +993,9 @@ def match_devstral(model_path: str):
 def match_phi_4_mm(model_path: str):
     if "phi-4-multimodal" in model_path.lower():
         return "phi-4-mm"
+
+
+@register_conv_template_matching_function
+def match_vila(model_path: str):
+    if re.search(r"vila", model_path, re.IGNORECASE):
+        return "chatml"
sglang/srt/custom_op.py
CHANGED
@@ -1,9 +1,11 @@
 from torch import nn
 
-from sglang.srt.utils import is_cuda, is_hip
+from sglang.srt.utils import cpu_has_amx_support, is_cpu, is_cuda, is_hip
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_is_cpu = is_cpu()
+_is_cpu_amx_available = cpu_has_amx_support()
 
 
 class CustomOp(nn.Module):
@@ -75,5 +77,7 @@ class CustomOp(nn.Module):
             return self.forward_cuda
         elif _is_hip:
             return self.forward_hip
+        elif _is_cpu and _is_cpu_amx_available:
+            return self.forward_cpu
         else:
            return self.forward_native
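With this change, dispatch resolves to `forward_cuda` on CUDA, `forward_hip` on ROCm, and now `forward_cpu` on CPU builds with AMX support, falling back to `forward_native` otherwise. A minimal sketch of a subclass riding on that dispatch (the op itself is hypothetical, not part of the package):

import torch
from sglang.srt.custom_op import CustomOp

class ScaledAdd(CustomOp):
    """Hypothetical op computing x + alpha * residual."""

    def __init__(self, alpha: float):
        super().__init__()
        self.alpha = alpha

    def forward_native(self, x: torch.Tensor, residual: torch.Tensor):
        # Portable fallback; the platform hooks below just delegate to it,
        # where a real op would call a fused kernel instead.
        return x + self.alpha * residual

    def forward_cuda(self, x, residual):
        return self.forward_native(x, residual)

    def forward_cpu(self, x, residual):
        # Selected only when is_cpu() and cpu_has_amx_support() both hold.
        return self.forward_native(x, residual)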
sglang/srt/disaggregation/base/conn.py
CHANGED
@@ -1,23 +1,32 @@
+from __future__ import annotations
+
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import TYPE_CHECKING, List, Optional
 
 import numpy as np
 import numpy.typing as npt
 
-from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.server_args import ServerArgs
 
+if TYPE_CHECKING:
+    from sglang.srt.disaggregation.utils import DisaggregationMode
+
 
 class KVArgs:
     engine_rank: int
-    kv_data_ptrs:
-    kv_data_lens:
-    kv_item_lens:
-    aux_data_ptrs:
-    aux_data_lens:
-    aux_item_lens:
+    kv_data_ptrs: List[int]
+    kv_data_lens: List[int]
+    kv_item_lens: List[int]
+    aux_data_ptrs: List[int]
+    aux_data_lens: List[int]
+    aux_item_lens: List[int]
     ib_device: str
+    ib_traffic_class: str
     gpu_id: int
+    # for different tp
+    decode_tp_size: int
+    # for pp prefill
+    prefill_pp_size: int
 
 
 class KVPoll:
@@ -45,7 +54,12 @@ class BaseKVSender(ABC):
 
     @abstractmethod
     def __init__(
-        self,
+        self,
+        mgr: BaseKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: int,
+        dest_tp_ranks: List[int],
+        pp_rank: int,
    ): ...
 
     @abstractmethod
@@ -56,7 +70,7 @@ class BaseKVSender(ABC):
     ...
 
     @abstractmethod
-    def send(self, kv_indices: npt.NDArray[np.
+    def send(self, kv_indices: npt.NDArray[np.int32]):
         """
         Send the kv cache at the given kv indices to the decoder server
         """
@@ -88,7 +102,7 @@ class BaseKVReceiver(ABC):
     ): ...
 
     @abstractmethod
-    def init(self, kv_indices: npt.NDArray[np.
+    def init(self, kv_indices: npt.NDArray[np.int32], aux_index: Optional[int] = None):
         """
         Notify the prefill server about the kv indices and aux index
         """
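`KVArgs` is a plain descriptor that the transfer backends consume; 0.4.8 widens it with typed pointer/length lists plus the new `ib_traffic_class`, `decode_tp_size`, and `prefill_pp_size` fields. A hedged sketch of populating it, with all values purely illustrative (a hypothetical two-layer KV pool; the addresses and sizes are made up):

from sglang.srt.disaggregation.base.conn import KVArgs

args = KVArgs()
args.engine_rank = 0
args.kv_data_ptrs = [0x7F0000000000, 0x7F0000800000]  # one base address per layer
args.kv_data_lens = [8 << 20, 8 << 20]                # bytes per layer buffer
args.kv_item_lens = [4096, 4096]                      # bytes per token slot
args.aux_data_ptrs, args.aux_data_lens, args.aux_item_lens = [], [], []
args.ib_device = "mlx5_0"
args.ib_traffic_class = ""   # new in 0.4.8
args.gpu_id = 0
args.decode_tp_size = 1      # new: decode-side tensor-parallel degree
args.prefill_pp_size = 1     # new: prefill-side pipeline-parallel degree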
sglang/srt/disaggregation/common/utils.py
ADDED
@@ -0,0 +1,42 @@
+import threading
+from collections import deque
+from typing import List, Tuple
+
+import numpy as np
+import numpy.typing as npt
+
+
+class FastQueue:
+    def __init__(self):
+        self._buf = deque()
+        self._cond = threading.Condition()
+
+    def put(self, item):
+        with self._cond:
+            self._buf.append(item)
+            # wake up a thread of wait()
+            self._cond.notify()
+
+    def get(self):
+        with self._cond:
+            # if queue is empty ,block until is notified()
+            while not self._buf:
+                self._cond.wait()
+            return self._buf.popleft()
+
+
+def group_concurrent_contiguous(
+    src_indices: npt.NDArray[np.int32], dst_indices: npt.NDArray[np.int32]
+) -> Tuple[List[npt.NDArray[np.int32]], List[npt.NDArray[np.int32]]]:
+    """Vectorised NumPy implementation."""
+    if src_indices.size == 0:
+        return [], []
+
+    brk = np.where((np.diff(src_indices) != 1) | (np.diff(dst_indices) != 1))[0] + 1
+    src_groups = np.split(src_indices, brk)
+    dst_groups = np.split(dst_indices, brk)
+
+    src_groups = [g.tolist() for g in src_groups]
+    dst_groups = [g.tolist() for g in dst_groups]
+
+    return src_groups, dst_groups