sglang 0.5.4.post1__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +18 -3
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  5. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +120 -0
  6. sglang/srt/checkpoint_engine/__init__.py +9 -0
  7. sglang/srt/checkpoint_engine/update.py +317 -0
  8. sglang/srt/configs/__init__.py +2 -0
  9. sglang/srt/configs/deepseek_ocr.py +542 -10
  10. sglang/srt/configs/deepseekvl2.py +95 -194
  11. sglang/srt/configs/kimi_linear.py +160 -0
  12. sglang/srt/configs/mamba_utils.py +66 -0
  13. sglang/srt/configs/model_config.py +25 -2
  14. sglang/srt/constants.py +7 -0
  15. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  16. sglang/srt/disaggregation/decode.py +34 -6
  17. sglang/srt/disaggregation/nixl/conn.py +2 -2
  18. sglang/srt/disaggregation/prefill.py +25 -3
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  20. sglang/srt/distributed/parallel_state.py +9 -5
  21. sglang/srt/entrypoints/engine.py +13 -5
  22. sglang/srt/entrypoints/http_server.py +22 -3
  23. sglang/srt/entrypoints/openai/protocol.py +7 -1
  24. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  27. sglang/srt/environ.py +7 -0
  28. sglang/srt/eplb/expert_distribution.py +34 -1
  29. sglang/srt/eplb/expert_location.py +106 -36
  30. sglang/srt/grpc/compile_proto.py +3 -0
  31. sglang/srt/layers/attention/ascend_backend.py +233 -5
  32. sglang/srt/layers/attention/attention_registry.py +3 -0
  33. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  34. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  35. sglang/srt/layers/attention/fla/kda.py +1359 -0
  36. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  37. sglang/srt/layers/attention/flashattention_backend.py +7 -6
  38. sglang/srt/layers/attention/flashinfer_mla_backend.py +3 -1
  39. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  40. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  41. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  42. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  43. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  44. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  45. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  46. sglang/srt/layers/attention/nsa_backend.py +157 -23
  47. sglang/srt/layers/attention/triton_backend.py +4 -1
  48. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  49. sglang/srt/layers/attention/trtllm_mla_backend.py +10 -2
  50. sglang/srt/layers/communicator.py +23 -1
  51. sglang/srt/layers/layernorm.py +16 -2
  52. sglang/srt/layers/logits_processor.py +4 -20
  53. sglang/srt/layers/moe/ep_moe/layer.py +0 -18
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  57. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  59. sglang/srt/layers/moe/moe_runner/deep_gemm.py +53 -33
  60. sglang/srt/layers/moe/token_dispatcher/deepep.py +12 -9
  61. sglang/srt/layers/moe/topk.py +31 -6
  62. sglang/srt/layers/pooler.py +21 -2
  63. sglang/srt/layers/quantization/__init__.py +9 -78
  64. sglang/srt/layers/quantization/auto_round.py +394 -0
  65. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  66. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  67. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  68. sglang/srt/layers/rotary_embedding.py +117 -45
  69. sglang/srt/lora/lora_registry.py +9 -0
  70. sglang/srt/managers/async_mm_data_processor.py +122 -0
  71. sglang/srt/managers/data_parallel_controller.py +30 -3
  72. sglang/srt/managers/detokenizer_manager.py +3 -0
  73. sglang/srt/managers/io_struct.py +26 -4
  74. sglang/srt/managers/multi_tokenizer_mixin.py +5 -0
  75. sglang/srt/managers/schedule_batch.py +74 -15
  76. sglang/srt/managers/scheduler.py +164 -129
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  78. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  79. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  80. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  81. sglang/srt/managers/session_controller.py +6 -5
  82. sglang/srt/managers/tokenizer_manager.py +154 -59
  83. sglang/srt/managers/tp_worker.py +24 -1
  84. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  85. sglang/srt/mem_cache/common.py +1 -0
  86. sglang/srt/mem_cache/memory_pool.py +171 -57
  87. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  88. sglang/srt/mem_cache/radix_cache.py +4 -0
  89. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  90. sglang/srt/metrics/collector.py +46 -3
  91. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  92. sglang/srt/model_executor/forward_batch_info.py +11 -11
  93. sglang/srt/model_executor/model_runner.py +76 -21
  94. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  95. sglang/srt/model_loader/weight_utils.py +1 -1
  96. sglang/srt/models/bailing_moe.py +9 -2
  97. sglang/srt/models/deepseek_nextn.py +11 -2
  98. sglang/srt/models/deepseek_v2.py +149 -34
  99. sglang/srt/models/glm4.py +391 -77
  100. sglang/srt/models/glm4v.py +196 -55
  101. sglang/srt/models/glm4v_moe.py +0 -1
  102. sglang/srt/models/gpt_oss.py +1 -10
  103. sglang/srt/models/kimi_linear.py +678 -0
  104. sglang/srt/models/llama4.py +1 -1
  105. sglang/srt/models/llama_eagle3.py +11 -1
  106. sglang/srt/models/longcat_flash.py +2 -2
  107. sglang/srt/models/minimax_m2.py +1 -1
  108. sglang/srt/models/qwen2.py +1 -1
  109. sglang/srt/models/qwen2_moe.py +30 -15
  110. sglang/srt/models/qwen3.py +1 -1
  111. sglang/srt/models/qwen3_moe.py +16 -8
  112. sglang/srt/models/qwen3_next.py +7 -0
  113. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  114. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  115. sglang/srt/multiplex/pdmux_context.py +164 -0
  116. sglang/srt/parser/conversation.py +7 -1
  117. sglang/srt/sampling/custom_logit_processor.py +67 -1
  118. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  119. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  120. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  121. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  122. sglang/srt/server_args.py +103 -22
  123. sglang/srt/single_batch_overlap.py +4 -1
  124. sglang/srt/speculative/draft_utils.py +16 -0
  125. sglang/srt/speculative/eagle_info.py +42 -36
  126. sglang/srt/speculative/eagle_info_v2.py +68 -25
  127. sglang/srt/speculative/eagle_utils.py +261 -16
  128. sglang/srt/speculative/eagle_worker.py +11 -3
  129. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  130. sglang/srt/speculative/spec_info.py +305 -31
  131. sglang/srt/speculative/spec_utils.py +44 -8
  132. sglang/srt/tracing/trace.py +121 -12
  133. sglang/srt/utils/common.py +55 -32
  134. sglang/srt/utils/hf_transformers_utils.py +38 -16
  135. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  136. sglang/test/kits/radix_cache_server_kit.py +50 -0
  137. sglang/test/runners.py +31 -7
  138. sglang/test/simple_eval_common.py +5 -3
  139. sglang/test/simple_eval_humaneval.py +1 -0
  140. sglang/test/simple_eval_math.py +1 -0
  141. sglang/test/simple_eval_mmlu.py +1 -0
  142. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  143. sglang/test/test_utils.py +7 -1
  144. sglang/version.py +1 -1
  145. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +10 -24
  146. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +150 -136
  147. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  148. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  149. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/speculative/eagle_worker.py
@@ -5,6 +5,7 @@ from typing import List, Optional, Tuple
 import torch

 from sglang.srt.distributed import get_tp_group
+from sglang.srt.layers.dp_attention import get_attention_tp_group
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.layers.sampler import get_token_ids_logprobs, get_top_logprobs
 from sglang.srt.managers.schedule_batch import ScheduleBatch
@@ -52,9 +53,12 @@ from sglang.srt.utils import (
     get_available_gpu_memory,
     get_bool_env_var,
     is_cuda,
+    is_npu,
     next_power_of_2,
 )

+_is_npu = is_npu()
+
 if is_cuda():
     from sgl_kernel import segment_packbits  # noqa: F401

@@ -117,7 +121,11 @@ class EAGLEWorker(TpModelWorker):
         self.hot_token_id = None

         # Init draft worker
-        with empty_context():
+        if server_args.enable_dp_attention and self.speculative_algorithm.is_eagle3():
+            ctx = draft_tp_context(get_attention_tp_group())
+        else:
+            ctx = empty_context()
+        with ctx:
             super().__init__(
                 server_args=server_args,
                 gpu_id=gpu_id,
@@ -200,7 +208,7 @@ class EAGLEWorker(TpModelWorker):
         self.cuda_graph_runner = None
         self.cuda_graph_runner_for_draft_extend = None

-        if self.server_args.disable_cuda_graph:
+        if self.server_args.disable_cuda_graph or _is_npu:
             return

         # Capture draft
@@ -940,7 +948,7 @@ class EAGLEWorker(TpModelWorker):
         draft_input.hidden_states = logits_output.hidden_states


-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, disable=_is_npu)
 def get_last_loc_large_page_size_top_k_1(
     req_to_token: torch.Tensor,
     req_pool_indices: torch.Tensor,
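
A recurring change in this release gates torch.compile behind an NPU check rather than forking every call site: passing disable=True makes the decorator a pass-through, so the same function runs eagerly on Ascend and compiled elsewhere. A minimal sketch of the pattern; the is_npu probe below is a stand-in for sglang.srt.utils.is_npu, not the shipped helper:

import torch

def is_npu() -> bool:
    # Stand-in probe: true only on builds that ship the torch.npu backend.
    return hasattr(torch, "npu") and torch.npu.is_available()

_is_npu = is_npu()

# With disable=True, torch.compile returns the function unchanged, so the
# decorated code runs eagerly on NPU and compiled everywhere else.
@torch.compile(dynamic=True, disable=_is_npu)
def scale_and_shift(x: torch.Tensor) -> torch.Tensor:
    return x * 2.0 + 1.0
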
sglang/srt/speculative/eagle_worker_v2.py
@@ -4,7 +4,6 @@ import time
 from typing import List, Optional, Tuple

 import torch
-from torch.cuda import Stream as CudaStream

 from sglang.srt.environ import envs
 from sglang.srt.managers.schedule_batch import ModelWorkerBatch
@@ -38,18 +37,21 @@ from sglang.srt.utils.common import (
     empty_context,
     fast_topk,
     get_available_gpu_memory,
+    is_npu,
     next_power_of_2,
 )

+_is_npu = is_npu()
+
 logger = logging.getLogger(__name__)


 def _get_plan_stream(
     device: str,
-) -> Tuple[Optional[CudaStream], contextlib.AbstractContextManager]:
+) -> Tuple[any, contextlib.AbstractContextManager]:
     if envs.SGLANG_ENABLE_OVERLAP_PLAN_STREAM.get():
-        plan_stream: CudaStream = torch.get_device_module(device).Stream()
-        plan_stream_ctx = torch.cuda.stream(plan_stream)
+        plan_stream = torch.get_device_module(device).Stream()
+        plan_stream_ctx = torch.get_device_module(device).stream(plan_stream)
         return plan_stream, plan_stream_ctx
     else:
         return None, contextlib.nullcontext()
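
Stream handling in this file now resolves through torch.get_device_module(device), which maps "cuda" to torch.cuda and, on Ascend builds, "npu" to torch.npu, so one code path serves both backends. A self-contained sketch of that pattern, assuming a CUDA machine:

import torch

def make_side_stream(device: str = "cuda"):
    mod = torch.get_device_module(device)  # e.g. torch.cuda or torch.npu
    stream = mod.Stream()                  # side stream on that backend
    return stream, mod.stream(stream)      # stream plus a context manager entering it

if torch.cuda.is_available():
    stream, ctx = make_side_stream("cuda")
    with ctx:
        # Work queued here lands on the side stream.
        y = torch.ones(4, device="cuda") * 2
    # The default stream must wait before consuming y, mirroring the
    # wait_stream() calls in the hunks below.
    torch.cuda.current_stream().wait_stream(stream)
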
@@ -206,7 +208,7 @@ class EagleDraftWorker(BaseDraftWorker):
         self.cuda_graph_runner = None
         self.cuda_graph_runner_for_draft_extend = None

-        if self.server_args.disable_cuda_graph:
+        if self.server_args.disable_cuda_graph or _is_npu:
             return

         # Capture draft
@@ -456,7 +458,9 @@ class EagleDraftWorker(BaseDraftWorker):
         )

         if self.plan_stream:
-            torch.cuda.current_stream().wait_stream(self.plan_stream)
+            torch.get_device_module(self.device).current_stream().wait_stream(
+                self.plan_stream
+            )

         # Run draft extend batch in the main compute stream
         draft_logits_output = self.draft_runner.model.forward(
@@ -577,7 +581,9 @@ class EAGLEWorkerV2(BaseSpecWorker):
         # Since batch.seq_lens is allocated in another stream, we need
         # record_stream() to prevent pytorch gc and reuse the gpu memory
         # while forward_stream is still running.
-        batch.seq_lens.record_stream(torch.cuda.current_stream())
+        batch.seq_lens.record_stream(
+            torch.get_device_module(self.device).current_stream()
+        )

         # Parse args
         verify_input: EagleVerifyInput = batch.spec_info
@@ -596,7 +602,7 @@

         # Correct some buffers due to the overlap plan
         if self.plan_stream:
-            torch.cuda.current_stream().wait_stream(self.plan_stream)
+            torch.get_device_module().current_stream().wait_stream(self.plan_stream)

         # Some values such as custom_mask and position depend on the output of draft,
         # so the previous plan step used the wrong values. Here, we need to run the related
@@ -628,7 +634,7 @@
             accept_index,
         ) = verify_input.sample(batch, logits_output)
         new_seq_lens = batch.seq_lens + accept_length
-        verify_done = torch.cuda.Event()
+        verify_done = torch.get_device_module(self.device).Event()
         verify_done.record()

         all_verified_id = predict[accept_index]
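
Events get the same device-agnostic rewrite as streams: torch.get_device_module(device).Event resolves to torch.cuda.Event on CUDA and torch.npu.Event on Ascend builds. A short sketch, assuming a CUDA machine:

import torch

if torch.cuda.is_available():
    device = "cuda"
    done = torch.get_device_module(device).Event()
    y = torch.randn(512, 512, device=device) @ torch.randn(512, 512, device=device)
    done.record()       # marks the point at which queued work finishes
    done.synchronize()  # host blocks until the marker is reached
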
sglang/srt/speculative/spec_info.py
@@ -1,46 +1,320 @@
+from __future__ import annotations
+
+import threading
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from enum import IntEnum, auto
-from functools import lru_cache
-from typing import List, Tuple
+from typing import (
+    Any,
+    Callable,
+    DefaultDict,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+)

 from sglang.srt.managers.schedule_batch import ModelWorkerBatch

+DraftWorkerClass = Callable[..., Any]
+DraftWorkerFactory = Callable[..., Any]

-class SpeculativeAlgorithm(IntEnum):
-    NONE = auto()
-    EAGLE = auto()
-    EAGLE3 = auto()
-    STANDALONE = auto()
-    NGRAM = auto()

-    def is_none(self):
-        return self == SpeculativeAlgorithm.NONE
+class _SpeculativeAlgorithmMeta(type):
+    def __iter__(cls) -> Iterator["SpeculativeAlgorithm"]:
+        return iter(cls._registration_order)

-    def is_eagle(self):
-        return self == SpeculativeAlgorithm.EAGLE or self == SpeculativeAlgorithm.EAGLE3

-    def is_eagle3(self):
-        return self == SpeculativeAlgorithm.EAGLE3
+class SpeculativeAlgorithm(metaclass=_SpeculativeAlgorithmMeta):
+    """Registry-backed representation of speculative decoding algorithms."""

-    def is_standalone(self):
-        return self == SpeculativeAlgorithm.STANDALONE
+    __slots__ = ("name", "value", "_draft_worker_factory")

-    def is_ngram(self):
-        return self == SpeculativeAlgorithm.NGRAM
+    _registry_by_name: Dict[str, "SpeculativeAlgorithm"] = {}
+    _registry_by_value: Dict[int, "SpeculativeAlgorithm"] = {}
+    _registration_order: List["SpeculativeAlgorithm"] = []
+    _flags: DefaultDict[str, Set[int]] = defaultdict(set)
+    _next_value: int = 0

-    @lru_cache(maxsize=None)
-    @staticmethod
-    def from_string(name: str):
-        name_map = {
-            "EAGLE": SpeculativeAlgorithm.EAGLE,
-            "EAGLE3": SpeculativeAlgorithm.EAGLE3,
-            "STANDALONE": SpeculativeAlgorithm.STANDALONE,
-            "NGRAM": SpeculativeAlgorithm.NGRAM,
-            None: SpeculativeAlgorithm.NONE,
-        }
-        if name is not None:
-            name = name.upper()
-        return name_map[name]
+    def __init__(
+        self,
+        name: str,
+        value: int,
+        draft_worker_factory: Optional[DraftWorkerFactory] = None,
+    ):
+        self.name = name
+        self.value = value
+        self._draft_worker_factory = draft_worker_factory
+
+    def __repr__(self) -> str:  # pragma: no cover - trivial
+        return f"SpeculativeAlgorithm.{self.name}"
+
+    def __str__(self) -> str:  # pragma: no cover - trivial
+        return self.name
+
+    def __hash__(self) -> int:
+        return hash(self.value)
+
+    def __eq__(self, other: object) -> bool:
+        if isinstance(other, SpeculativeAlgorithm):
+            return self.value == other.value
+        return NotImplemented
+
+    def __int__(self) -> int:
+        return self.value
+
+    @classmethod
+    def register(
+        cls,
+        name: str,
+        *,
+        aliases: Optional[Sequence[str]] = None,
+        value: Optional[int] = None,
+        draft_worker_factory: Optional[DraftWorkerFactory] = None,
+    ) -> SpeculativeAlgorithm:
+        normalized_name = name.upper()
+        if normalized_name in cls._registry_by_name:
+            raise ValueError(
+                f"SpeculativeAlgorithm '{normalized_name}' already registered"
+            )
+
+        if value is None:
+            value = cls._next_value
+        cls._next_value = max(cls._next_value, value + 1)
+
+        algorithm = cls(
+            normalized_name,
+            value,
+            draft_worker_factory=draft_worker_factory,
+        )
+
+        cls._registry_by_name[normalized_name] = algorithm
+        cls._registry_by_value[value] = algorithm
+        cls._registration_order.append(algorithm)
+        setattr(cls, normalized_name, algorithm)
+
+        if aliases:
+            cls.register_aliases(algorithm, *aliases)
+
+        return algorithm
+
+    @classmethod
+    def register_aliases(cls, algorithm: SpeculativeAlgorithm, *aliases: str) -> None:
+        for alias in aliases:
+            cls._registry_by_name[alias.upper()] = algorithm
+
+    @classmethod
+    def register_draft_worker(
+        cls,
+        algorithm: SpeculativeAlgorithm | str,
+        factory: DraftWorkerFactory,
+    ) -> None:
+        algo = cls._ensure_algorithm(algorithm)
+        algo._draft_worker_factory = factory
+
+    @classmethod
+    def _ensure_algorithm(
+        cls, algorithm: SpeculativeAlgorithm | str
+    ) -> SpeculativeAlgorithm:
+        if isinstance(algorithm, SpeculativeAlgorithm):
+            return algorithm
+        if isinstance(algorithm, str):
+            return cls.from_string(algorithm)
+        raise TypeError(f"Unsupported algorithm identifier: {algorithm!r}")
+
+    @classmethod
+    def _add_flag(
+        cls, flag: str | Sequence[str], algorithm: SpeculativeAlgorithm | str
+    ) -> None:
+        algo = cls._ensure_algorithm(algorithm)
+        if isinstance(flag, str):
+            flag_iter = (flag,)
+        else:
+            flag_iter = flag
+        for flag_name in flag_iter:
+            cls._flags[flag_name.upper()].add(algo.value)
+
+    @classmethod
+    def from_string(cls, name: Optional[str]) -> SpeculativeAlgorithm:
+        if name is None:
+            return cls.NONE
+        try:
+            return cls._registry_by_name[name.upper()]
+        except KeyError as exc:
+            raise ValueError(f"Unknown speculative algorithm '{name}'") from exc
+
+    @classmethod
+    def from_value(cls, value: int) -> SpeculativeAlgorithm:
+        try:
+            return cls._registry_by_value[value]
+        except KeyError as exc:
+            raise ValueError(f"Unknown speculative algorithm id {value}") from exc
+
+    def _has_flag(self, flag: str) -> bool:
+        return self.value in type(self)._flags.get(flag.upper(), set())
+
+    def is_none(self) -> bool:
+        return self is SpeculativeAlgorithm.NONE
+
+    def is_eagle(self) -> bool:
+        return self._has_flag("EAGLE")
+
+    def is_eagle3(self) -> bool:
+        return self._has_flag("EAGLE3")
+
+    def is_standalone(self) -> bool:
+        return self._has_flag("STANDALONE")
+
+    def is_ngram(self) -> bool:
+        return self._has_flag("NGRAM")
+
+    def create_draft_worker(self, **factory_kwargs: Any) -> Any:
+        if self._draft_worker_factory is None:
+            return None
+        return self._draft_worker_factory(self, **factory_kwargs)
+
+
+# Registry helpers backed by `SpeculativeAlgorithm`.
+_LOCK = threading.RLock()
+_REGISTERED_WORKERS: Dict[SpeculativeAlgorithm, DraftWorkerClass] = {}
+_FLAG_MARKERS: Dict[str, Callable[[Union[SpeculativeAlgorithm, str]], None]] = {
+    "EAGLE": lambda algorithm: SpeculativeAlgorithm._add_flag("EAGLE", algorithm),
+    "EAGLE3": lambda algorithm: SpeculativeAlgorithm._add_flag("EAGLE3", algorithm),
+    "STANDALONE": lambda algorithm: SpeculativeAlgorithm._add_flag(
+        "STANDALONE", algorithm
+    ),
+    "NGRAM": lambda algorithm: SpeculativeAlgorithm._add_flag("NGRAM", algorithm),
+}
+
+
+def _wrap_worker_class(worker_cls: DraftWorkerClass) -> DraftWorkerFactory:
+    def _factory(_: SpeculativeAlgorithm, **kwargs: Any) -> Any:
+        return worker_cls(**kwargs)
+
+    return _factory
+
+
+def register_speculative_algorithm(
+    name: str,
+    worker_cls: DraftWorkerClass,
+    *,
+    aliases: Optional[Sequence[str]] = None,
+    flags: Optional[Iterable[str]] = None,
+    value: Optional[int] = None,
+    override_worker: bool = False,
+) -> SpeculativeAlgorithm:
+    """Register a speculative algorithm and the associated draft worker class.
+
+    Example:
+        >>> from sglang.srt.speculative.spec_info import register_speculative_algorithm
+        >>> register_speculative_algorithm("MY_ALGO", MyDraftWorker, flags=("EAGLE",))
+    """
+
+    name_upper = name.upper()
+    with _LOCK:
+        try:
+            algorithm = SpeculativeAlgorithm.from_string(name_upper)
+            exists = True
+        except ValueError:
+            algorithm = SpeculativeAlgorithm.register(
+                name_upper,
+                aliases=aliases,
+                value=value,
+            )
+            SpeculativeAlgorithm.register_draft_worker(
+                algorithm, _wrap_worker_class(worker_cls)
+            )
+            exists = False
+
+        if exists:
+            if aliases:
+                SpeculativeAlgorithm.register_aliases(algorithm, *aliases)
+            if not override_worker and algorithm in _REGISTERED_WORKERS:
+                raise ValueError(
+                    f"Worker already registered for {algorithm!r}. "
+                    "Pass override_worker=True to replace it."
+                )
+            SpeculativeAlgorithm.register_draft_worker(
+                algorithm, _wrap_worker_class(worker_cls)
+            )
+
+        _REGISTERED_WORKERS[algorithm] = worker_cls
+
+        if flags:
+            for flag in flags:
+                marker = _FLAG_MARKERS.get(flag.upper())
+                if marker is None:
+                    raise ValueError(f"Unsupported flag '{flag}'")
+                marker(algorithm)
+
+        return algorithm
+
+
+def list_registered_workers() -> Dict[str, DraftWorkerClass]:
+    """Return a snapshot of registered speculative worker classes keyed by algorithm name."""
+    with _LOCK:
+        return {algo.name: cls for algo, cls in _REGISTERED_WORKERS.items()}
+
+
+def _create_eagle_worker(**kwargs: Any) -> Any:
+    enable_overlap = kwargs.pop("enable_overlap", False)
+    if enable_overlap:
+        from sglang.srt.speculative.eagle_worker_v2 import EAGLEWorkerV2
+
+        return EAGLEWorkerV2(**kwargs)
+
+    from sglang.srt.speculative.eagle_worker import EAGLEWorker
+
+    return EAGLEWorker(**kwargs)
+
+
+def _create_standalone_worker(**kwargs: Any) -> Any:
+    from sglang.srt.speculative.standalone_worker import StandaloneWorker
+
+    return StandaloneWorker(**kwargs)
+
+
+def _create_ngram_worker(**kwargs: Any) -> Any:
+    from sglang.srt.speculative.ngram_worker import NGRAMWorker
+
+    return NGRAMWorker(**kwargs)
+
+
+# Register built-in algorithms.
+# Third-party integrations should import `SpeculativeAlgorithm` and either
+# call `register_speculative_algorithm` or use the helpers below to attach
+# additional draft workers.
+SpeculativeAlgorithm.register("NONE")
+
+register_speculative_algorithm(
+    "EAGLE",
+    aliases=("NEXTN",),
+    worker_cls=_create_eagle_worker,
+    flags=("EAGLE",),
+)
+
+register_speculative_algorithm(
+    "EAGLE3",
+    worker_cls=_create_eagle_worker,
+    flags=("EAGLE", "EAGLE3"),
+)
+
+register_speculative_algorithm(
+    "STANDALONE",
+    worker_cls=_create_standalone_worker,
+    flags=("STANDALONE",),
+)
+
+register_speculative_algorithm(
+    "NGRAM",
+    worker_cls=_create_ngram_worker,
+    flags=("NGRAM",),
+)


 class SpecInputType(IntEnum):
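
The rewrite above replaces the closed IntEnum with a registry so out-of-tree draft workers can be attached without patching sglang. A usage sketch against the new API; MyDraftWorker is a hypothetical third-party class, not part of sglang:

from sglang.srt.speculative.spec_info import (
    SpeculativeAlgorithm,
    register_speculative_algorithm,
)

class MyDraftWorker:
    # Any class constructible from the worker kwargs will do.
    def __init__(self, **kwargs):
        self.kwargs = kwargs

# Register under a new name and reuse the EAGLE behavior flag.
algo = register_speculative_algorithm("MY_ALGO", MyDraftWorker, flags=("EAGLE",))

assert SpeculativeAlgorithm.from_string("my_algo") is algo  # lookup is case-insensitive
assert algo.is_eagle() and not algo.is_eagle3()
worker = algo.create_draft_worker(gpu_id=0)  # instantiates MyDraftWorker(gpu_id=0)
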
sglang/srt/speculative/spec_utils.py
@@ -19,16 +19,22 @@ from sglang.srt.distributed.parallel_state import (
 from sglang.srt.environ import envs
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import Req
-from sglang.srt.utils import is_cuda, is_hip
+from sglang.srt.utils import is_cuda, is_hip, is_npu, next_power_of_2
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_npu = is_npu()

 if TYPE_CHECKING:
     from sglang.srt.speculative.eagle_info import EagleVerifyInput


-if is_cuda():
+if _is_cuda:
     from sgl_kernel import fast_topk
-elif is_hip():
+elif _is_hip:
     from sgl_kernel import fast_topk
+else:
+    from sglang.srt.utils.common import fast_topk


 logger = logging.getLogger(__name__)
@@ -39,7 +45,7 @@ SIMULATE_ACC_LEN = envs.SGLANG_SIMULATE_ACC_LEN.get()  # turn off if < 0
 SIMULATE_ACC_METHOD = envs.SGLANG_SIMULATE_ACC_METHOD.get()

 TREE_TRAVERSE_TIME_THRESHOLD = 1  # TODO: set this properly
-TREE_SPEC_KERNEL_AVAILABLE = is_cuda()  # This kernel is only available for CUDA now
+TREE_SPEC_KERNEL_AVAILABLE = _is_cuda  # This kernel is only available for CUDA now


 @triton.jit
@@ -103,6 +109,36 @@ def assign_req_to_token_pool(
         load_offset += BLOCK_SIZE


+def assign_req_to_token_pool_func(
+    req_pool_indices: torch.Tensor,
+    req_to_token: torch.Tensor,
+    start_offset: torch.Tensor,
+    end_offset: torch.Tensor,
+    out_cache_loc: torch.Tensor,
+    batch_size: int,
+):
+    if _is_cuda or _is_hip:
+        assign_req_to_token_pool[(batch_size,)](
+            req_pool_indices,
+            req_to_token,
+            start_offset,
+            end_offset,
+            out_cache_loc,
+            req_to_token.shape[1],
+            next_power_of_2(batch_size),
+        )
+    elif _is_npu:
+        import sgl_kernel_npu  # noqa: F401
+
+        torch.ops.npu.cache_loc_assign(
+            req_pool_indices,
+            req_to_token,
+            start_offset,
+            end_offset,
+            out_cache_loc,
+        )
+
+
 @triton.jit
 def assign_draft_cache_locs(
     req_pool_indices,
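
assign_req_to_token_pool_func dispatches to the Triton kernel on CUDA/HIP and to torch.ops.npu.cache_loc_assign from sgl_kernel_npu on Ascend. For orientation, a plain-PyTorch reading of what the kernel writes, under the assumption that out_cache_loc holds the newly allocated slots for all requests back to back; this is an illustration, not the shipped kernel:

import torch

def assign_req_to_token_pool_ref(
    req_pool_indices, req_to_token, start_offset, end_offset, out_cache_loc
):
    # Scatter each request's slice of out_cache_loc into its row of the
    # request-to-token map, between that request's start and end offsets.
    copied = 0
    for i in range(req_pool_indices.numel()):
        row = req_pool_indices[i].item()
        start, end = start_offset[i].item(), end_offset[i].item()
        n = end - start
        req_to_token[row, start:end] = out_cache_loc[copied : copied + n]
        copied += n
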
@@ -331,7 +367,7 @@ def get_target_cache_loc(
     )


-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, disable=_is_npu)
 def get_src_tgt_cache_loc(
     seq_lens: torch.Tensor,
     out_cache_loc: torch.Tensor,
@@ -381,7 +417,7 @@ def filter_finished_cache_loc_kernel(
     )


-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, disable=_is_npu)
 def create_accept_length_filter(
     accept_length: torch.Tensor,
     unfinished_index_device: torch.Tensor,
@@ -395,7 +431,7 @@ def create_accept_length_filter(
     return accept_length_filter


-@torch.compile(dynamic=True)
+@torch.compile(dynamic=True, disable=_is_npu)
 def select_top_k_tokens(
     i: int,
     topk_p: torch.Tensor,
@@ -413,7 +449,7 @@
         tree_info = (
             topk_p.unsqueeze(1),  # shape: (b, 1, topk)
             topk_index,  # shape: (b, topk)
-            torch.arange(-1, topk, dtype=torch.long, device="cuda")
+            torch.arange(-1, topk, dtype=torch.long, device=hidden_states.device)
             .unsqueeze(0)
             .repeat(topk_p.shape[0], 1),  # shape: (b, topk + 1)
         )
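
The final hunk drops a hard-coded device="cuda" in favor of the device of a tensor already in hand, which is what lets the tree construction run on NPU (or on CPU in tests). The same idea in isolation, as a standalone sketch:

import torch

def make_parent_offsets(hidden_states: torch.Tensor, topk: int) -> torch.Tensor:
    # Allocate on whatever device the inputs live on instead of assuming CUDA.
    return (
        torch.arange(-1, topk, dtype=torch.long, device=hidden_states.device)
        .unsqueeze(0)
        .repeat(hidden_states.shape[0], 1)  # shape: (b, topk + 1)
    )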