PyPI - sglang - Versions diffs - 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl - Mend

sglang 0.4.6.post5py3-none-any.whl → 0.4.7py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (318) hide show

sglang/srt/model_executor/model_runner.py CHANGED Viewed

@@ -35,8 +35,10 @@ from sglang.srt.distributed import (
     init_distributed_environment,
     initialize_model_parallel,
     set_custom_all_reduce,
+    set_mscclpp_all_reduce,
 )
 from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state
+from sglang.srt.layers.attention.tbo_backend import TboAttnBackend
 from sglang.srt.layers.dp_attention import (
     get_attention_tp_group,
     get_attention_tp_size,
@@ -50,6 +52,7 @@ from sglang.srt.layers.quantization.deep_gemm import (
 )
 from sglang.srt.layers.sampler import Sampler
 from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.lora.lora_manager import LoRAManager
 from sglang.srt.managers.eplb_manager import EPLBManager
 from sglang.srt.managers.expert_distribution import (
@@ -63,7 +66,10 @@ from sglang.srt.managers.expert_location import (
     get_global_expert_location_metadata,
     set_global_expert_location_metadata,
 )
-from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.managers.schedule_batch import (
+    GLOBAL_SERVER_ARGS_KEYS,
+    global_server_args_dict,
+)
 from sglang.srt.mem_cache.memory_pool import (
     DoubleSparseTokenToKVPool,
     MHATokenToKVPool,
@@ -72,15 +78,11 @@ from sglang.srt.mem_cache.memory_pool import (
     TokenToKVPoolAllocator,
 )
 from sglang.srt.mem_cache.paged_allocator import PagedTokenToKVPoolAllocator
-from sglang.srt.model_executor import expert_location_updater
 from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
+from sglang.srt.model_executor.expert_location_updater import ExpertLocationUpdater
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
 from sglang.srt.model_loader import get_model
-from sglang.srt.model_loader.loader import (
-    DefaultModelLoader,
-    device_loading_context,
-    get_model_loader,
-)
+from sglang.srt.model_loader.loader import DefaultModelLoader, get_model_loader
 from sglang.srt.model_loader.utils import set_default_torch_dtype
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.patch_torch import monkey_patch_torch_reductions
@@ -90,6 +92,7 @@ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 from sglang.srt.utils import (
     MultiprocessingSerializer,
+    cpu_has_amx_support,
     enable_show_time_cost,
     get_available_gpu_memory,
     get_bool_env_var,
@@ -188,32 +191,10 @@ class ModelRunner:
         # Global vars
         global_server_args_dict.update(
-            {
-                "attention_backend": server_args.attention_backend,
-                "debug_tensor_dump_inject": server_args.debug_tensor_dump_inject,
-                "debug_tensor_dump_output_folder": server_args.debug_tensor_dump_output_folder,
-                "deepep_mode": server_args.deepep_mode,
-                "device": server_args.device,
-                "disable_chunked_prefix_cache": server_args.disable_chunked_prefix_cache,
-                "disable_radix_cache": server_args.disable_radix_cache,
-                "enable_nan_detection": server_args.enable_nan_detection,
-                "enable_dp_attention": server_args.enable_dp_attention,
-                "enable_dp_lm_head": server_args.enable_dp_lm_head,
-                "enable_ep_moe": server_args.enable_ep_moe,
-                "enable_deepep_moe": server_args.enable_deepep_moe,
-                "deepep_config": server_args.deepep_config,
-                "flashinfer_mla_disable_ragged": server_args.flashinfer_mla_disable_ragged,
-                "moe_dense_tp_size": server_args.moe_dense_tp_size,
-                "ep_dispatch_algorithm": server_args.ep_dispatch_algorithm,
-                "n_share_experts_fusion": server_args.n_share_experts_fusion,
-                "triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32,
-                "torchao_config": server_args.torchao_config,
-                "sampling_backend": server_args.sampling_backend,
-                "speculative_accept_threshold_single": server_args.speculative_accept_threshold_single,
-                "speculative_accept_threshold_acc": server_args.speculative_accept_threshold_acc,
+            {k: getattr(server_args, k) for k in GLOBAL_SERVER_ARGS_KEYS}
+            | {
+                # TODO it is indeed not a "server args"
                 "use_mla_backend": self.use_mla_backend,
-                "mm_attention_backend": server_args.mm_attention_backend,
-                "ep_num_redundant_experts": server_args.ep_num_redundant_experts,
             }
         )
@@ -265,6 +246,7 @@ class ModelRunner:
             if self.server_args.enable_eplb and (not self.is_draft_worker)
             else None
         )
+        self.expert_location_updater = ExpertLocationUpdater()
         # Load the model
         self.sampler = Sampler()
@@ -314,6 +296,16 @@ class ModelRunner:
     def model_specific_adjustment(self):
         server_args = self.server_args
+        if (
+            server_args.attention_backend == "intel_amx"
+            and server_args.device == "cpu"
+            and not cpu_has_amx_support()
+        ):
+            logger.info(
+                "The current platform does not support Intel AMX, will fallback to torch_native backend."
+            )
+            server_args.attention_backend = "torch_native"
         if server_args.attention_backend is None:
             """
             Auto select the fastest attention backend.
@@ -323,7 +315,8 @@ class ModelRunner:
                 1.2 In other cases, we will use flashinfer if available, otherwise use triton.
             2. Models with MLA Architecture and using FA3
                 2.1 We will use FA3 backend on hopper.
-                2.2 Otherwise, we will use triton backend.
+                2.2 We will use Flashinfer backend on blackwell.
+                2.3 Otherwise, we will use triton backend.
             """
             if not self.use_mla_backend:
@@ -344,6 +337,17 @@ class ModelRunner:
                 # MLA architecture
                 if is_hopper_with_cuda_12_3():
                     server_args.attention_backend = "fa3"
+                elif is_sm100_supported():
+                    server_args.attention_backend = "flashinfer"
+                elif _is_hip:
+                    head_num = self.model_config.get_num_kv_heads(self.tp_size)
+                    # TODO current aiter only support head number 16 or 128 head number
+                    if (
+                        head_num == 128 or head_num == 16
+                    ) and self.spec_algorithm.is_none():
+                        server_args.attention_backend = "aiter"
+                    else:
+                        server_args.attention_backend = "triton"
                 else:
                     server_args.attention_backend = "triton"
             logger.info(
@@ -352,6 +356,7 @@ class ModelRunner:
         elif self.use_mla_backend:
             if server_args.device != "cpu":
                 if server_args.attention_backend in [
+                    "aiter",
                     "flashinfer",
                     "fa3",
                     "triton",
@@ -366,7 +371,10 @@ class ModelRunner:
                         f"Invalid attention backend for MLA: {server_args.attention_backend}"
                     )
             else:
-                raise ValueError("MLA optimization not supported on CPU.")
+                if server_args.attention_backend != "intel_amx":
+                    raise ValueError(
+                        "MLA optimization not supported on CPU except for intel_amx backend."
+                    )
         if (
             server_args.attention_backend == "fa3"
@@ -412,6 +420,10 @@ class ModelRunner:
         if not server_args.disable_chunked_prefix_cache:
             logger.info("Chunked prefix cache is turned on.")
+        if server_args.attention_backend == "aiter":
+            if self.model_config.context_len > 8192:
+                self.mem_fraction_static *= 0.85
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")
@@ -443,6 +455,7 @@ class ModelRunner:
         else:
             dist_init_method = f"tcp://127.0.0.1:{self.dist_port}"
         set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
+        set_mscclpp_all_reduce(self.server_args.enable_mscclpp)
         if not self.is_draft_worker:
             # Only initialize the distributed environment on the target model worker.
@@ -592,11 +605,14 @@ class ModelRunner:
             ) from None
     def update_expert_location(
-        self, new_expert_location_metadata: ExpertLocationMetadata
+        self,
+        new_expert_location_metadata: ExpertLocationMetadata,
+        update_layer_ids: List[int],
     ):
-        expert_location_updater.update_expert_location(
+        self.expert_location_updater.update(
             self.model.routed_experts_weights_of_layer,
             new_expert_location_metadata,
+            update_layer_ids=update_layer_ids,
             nnodes=self.server_args.nnodes,
             rank=self.tp_rank,
         )
@@ -900,12 +916,26 @@ class ModelRunner:
             )
         if self.req_to_token_pool is None:
-            self.req_to_token_pool = ReqToTokenPool(
-                size=max_num_reqs + 1,
-                max_context_len=self.model_config.context_len + 4,
-                device=self.device,
-                enable_memory_saver=self.server_args.enable_memory_saver,
-            )
+            if self.server_args.disaggregation_mode == "decode":
+                from sglang.srt.disaggregation.decode import DecodeReqToTokenPool
+                # subscribe memory for pre-allocated requests
+                # if max_num_reqs <= 32, we pre-allocate 2x requests
+                pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else 0
+                self.req_to_token_pool = DecodeReqToTokenPool(
+                    size=max_num_reqs,
+                    max_context_len=self.model_config.context_len + 4,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                    pre_alloc_size=pre_alloc_size,
+                )
+            else:
+                self.req_to_token_pool = ReqToTokenPool(
+                    size=max_num_reqs,
+                    max_context_len=self.model_config.context_len + 4,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                )
         else:
             # Draft worker shares req_to_token_pool with the target worker.
             assert self.is_draft_worker
@@ -990,6 +1020,13 @@ class ModelRunner:
     def init_attention_backend(self):
         """Init attention kernel backend."""
+        if self.server_args.enable_two_batch_overlap:
+            self.attn_backend = TboAttnBackend.init_new(self._get_attention_backend)
+        else:
+            self.attn_backend = self._get_attention_backend()
+    # TODO unify with 6338
+    def _get_attention_backend(self):
         if self.server_args.attention_backend == "flashinfer":
             if not self.use_mla_backend:
                 from sglang.srt.layers.attention.flashinfer_backend import (
@@ -999,22 +1036,18 @@ class ModelRunner:
                 # Init streams
                 if self.server_args.speculative_algorithm == "EAGLE":
                     self.plan_stream_for_flashinfer = torch.cuda.Stream()
-                self.attn_backend = FlashInferAttnBackend(self)
+                return FlashInferAttnBackend(self)
             else:
                 from sglang.srt.layers.attention.flashinfer_mla_backend import (
                     FlashInferMLAAttnBackend,
                 )
-                self.attn_backend = FlashInferMLAAttnBackend(self)
+                return FlashInferMLAAttnBackend(self)
         elif self.server_args.attention_backend == "aiter":
             from sglang.srt.layers.attention.aiter_backend import AiterAttnBackend
-            self.attn_backend = AiterAttnBackend(self)
+            return AiterAttnBackend(self)
         elif self.server_args.attention_backend == "triton":
-            assert self.sliding_window_size is None, (
-                "Window attention is not supported in the triton attention backend. "
-                "Please use `--attention-backend flashinfer`."
-            )
             assert not self.model_config.is_encoder_decoder, (
                 "Cross attention is not supported in the triton attention backend. "
                 "Please use `--attention-backend flashinfer`."
@@ -1024,21 +1057,21 @@ class ModelRunner:
                     DoubleSparseAttnBackend,
                 )
-                self.attn_backend = DoubleSparseAttnBackend(self)
+                return DoubleSparseAttnBackend(self)
             else:
                 from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
-                self.attn_backend = TritonAttnBackend(self)
+                return TritonAttnBackend(self)
         elif self.server_args.attention_backend == "torch_native":
             from sglang.srt.layers.attention.torch_native_backend import (
                 TorchNativeAttnBackend,
             )
-            self.attn_backend = TorchNativeAttnBackend(self)
+            return TorchNativeAttnBackend(self)
         elif self.server_args.attention_backend == "flashmla":
             from sglang.srt.layers.attention.flashmla_backend import FlashMLABackend
-            self.attn_backend = FlashMLABackend(self)
+            return FlashMLABackend(self)
         elif self.server_args.attention_backend == "fa3":
             assert (
                 torch.cuda.get_device_capability()[0] == 8 and not self.use_mla_backend
@@ -1050,13 +1083,20 @@ class ModelRunner:
                 FlashAttentionBackend,
             )
-            self.attn_backend = FlashAttentionBackend(self)
+            return FlashAttentionBackend(self)
         elif self.server_args.attention_backend == "cutlass_mla":
             from sglang.srt.layers.attention.cutlass_mla_backend import (
                 CutlassMLABackend,
             )
-            self.attn_backend = CutlassMLABackend(self)
+            return CutlassMLABackend(self)
+        elif self.server_args.attention_backend == "intel_amx":
+            from sglang.srt.layers.attention.intel_amx_backend import (
+                IntelAMXAttnBackend,
+            )
+            logger.info(f"Intel AMX attention backend is enabled.")
+            return IntelAMXAttnBackend(self)
         else:
             raise ValueError(
                 f"Invalid attention backend: {self.server_args.attention_backend}"
@@ -1174,7 +1214,7 @@ class ModelRunner:
             )
         if self.eplb_manager is not None:
-            self.eplb_manager.on_forward_pass_end(self.forward_pass_id)
+            self.eplb_manager.on_forward_pass_end()
         return output

sglang/srt/model_loader/utils.py CHANGED Viewed

@@ -2,12 +2,17 @@
 """Utilities for selecting and loading models."""
 import contextlib
+import logging
 from typing import Tuple, Type
 import torch
+import transformers
 from torch import nn
+from transformers.dynamic_module_utils import get_class_from_dynamic_module
-from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.configs.model_config import ModelConfig, ModelImpl
+logger = logging.getLogger(__name__)
 @contextlib.contextmanager
@@ -19,6 +24,61 @@ def set_default_torch_dtype(dtype: torch.dtype):
     torch.set_default_dtype(old_dtype)
+def resolve_transformers_arch(model_config: ModelConfig, architectures: list[str]):
+    for i, arch in enumerate(architectures):
+        if arch == "TransformersForCausalLM":
+            continue
+        auto_map: dict[str, str] = (
+            getattr(model_config.hf_config, "auto_map", None) or dict()
+        )
+        # Make sure that config class is always initialized before model class,
+        # otherwise the model class won't be able to access the config class,
+        # the expected auto_map should have correct order like:
+        # "auto_map": {
+        #     "AutoConfig": "<your-repo-name>--<config-name>",
+        #     "AutoModel": "<your-repo-name>--<config-name>",
+        #     "AutoModelFor<Task>": "<your-repo-name>--<config-name>",
+        # },
+        auto_modules = {
+            name: get_class_from_dynamic_module(
+                module, model_config.model_path, revision=model_config.revision
+            )
+            for name, module in sorted(auto_map.items(), key=lambda x: x[0])
+        }
+        model_module = getattr(transformers, arch, None)
+        if model_module is None:
+            if "AutoModel" not in auto_map:
+                raise ValueError(
+                    f"Cannot find model module. '{arch}' is not a registered "
+                    "model in the Transformers library (only relevant if the "
+                    "model is meant to be in Transformers) and 'AutoModel' is "
+                    "not present in the model config's 'auto_map' (relevant "
+                    "if the model is custom)."
+                )
+            model_module = auto_modules["AutoModel"]
+        if model_config.impl == ModelImpl.TRANSFORMERS:
+            if not model_module.is_backend_compatible():
+                raise ValueError(
+                    f"The Transformers implementation of {arch} is not "
+                    "compatible with vLLM."
+                )
+            architectures[i] = "TransformersForCausalLM"
+        if model_config.impl == ModelImpl.AUTO:
+            if not model_module.is_backend_compatible():
+                raise ValueError(
+                    f"{arch} has no SGlang implementation and the Transformers "
+                    "implementation is not compatible with SGLang."
+                )
+            logger.warning(
+                "%s has no SGLang implementation, falling back to Transformers "
+                "implementation. Some features may not be supported and "
+                "performance may not be optimal.",
+                arch,
+            )
+            architectures[i] = "TransformersForCausalLM"
+    return architectures
 def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
     from sglang.srt.models.registry import ModelRegistry
@@ -34,6 +94,12 @@ def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module],
     ):
         architectures = ["QuantMixtralForCausalLM"]
+    supported_archs = ModelRegistry.get_supported_archs()
+    is_native_supported = any(arch in supported_archs for arch in architectures)
+    if not is_native_supported or model_config.impl == ModelImpl.TRANSFORMERS:
+        architectures = resolve_transformers_arch(model_config, architectures)
     return ModelRegistry.resolve_model_cls(architectures)

sglang/srt/models/deepseek_nextn.py CHANGED Viewed

@@ -122,7 +122,7 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
         self.config = config
         self.tp_size = get_tensor_model_parallel_world_size()
         self.quant_config = quant_config
-        self.determine_n_share_experts_fusion("DeepseekV3ForCausalLMNextN")
+        self.determine_num_fused_shared_experts("DeepseekV3ForCausalLMNextN")
         self.model = DeepseekModelNextN(
             config, quant_config, prefix=add_prefix("model", prefix)

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl

sglang 0.4.6.post5py3-none-any.whl → 0.4.7py3-none-any.whl