sglang 0.5.3__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (112)
  1. sglang/bench_one_batch.py +0 -2
  2. sglang/bench_serving.py +224 -127
  3. sglang/compile_deep_gemm.py +3 -0
  4. sglang/launch_server.py +0 -14
  5. sglang/srt/configs/__init__.py +2 -0
  6. sglang/srt/configs/falcon_h1.py +12 -58
  7. sglang/srt/configs/mamba_utils.py +117 -0
  8. sglang/srt/configs/model_config.py +68 -31
  9. sglang/srt/configs/nemotron_h.py +286 -0
  10. sglang/srt/configs/qwen3_next.py +11 -43
  11. sglang/srt/disaggregation/decode.py +7 -18
  12. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  13. sglang/srt/disaggregation/nixl/conn.py +55 -23
  14. sglang/srt/disaggregation/prefill.py +17 -32
  15. sglang/srt/entrypoints/engine.py +2 -2
  16. sglang/srt/entrypoints/grpc_request_manager.py +10 -23
  17. sglang/srt/entrypoints/grpc_server.py +220 -80
  18. sglang/srt/entrypoints/http_server.py +49 -1
  19. sglang/srt/entrypoints/openai/protocol.py +159 -31
  20. sglang/srt/entrypoints/openai/serving_chat.py +13 -71
  21. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  22. sglang/srt/environ.py +4 -0
  23. sglang/srt/function_call/function_call_parser.py +8 -6
  24. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  25. sglang/srt/grpc/sglang_scheduler_pb2.pyi +64 -6
  26. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +88 -0
  27. sglang/srt/layers/attention/attention_registry.py +31 -22
  28. sglang/srt/layers/attention/fla/layernorm_gated.py +47 -30
  29. sglang/srt/layers/attention/flashattention_backend.py +0 -1
  30. sglang/srt/layers/attention/flashinfer_backend.py +223 -6
  31. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -1
  32. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -59
  33. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  34. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -4
  35. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  36. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  37. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  38. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  39. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  40. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  41. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  42. sglang/srt/layers/attention/triton_backend.py +1 -1
  43. sglang/srt/layers/logits_processor.py +136 -6
  44. sglang/srt/layers/modelopt_utils.py +11 -0
  45. sglang/srt/layers/moe/cutlass_w4a8_moe.py +18 -21
  46. sglang/srt/layers/moe/ep_moe/kernels.py +31 -452
  47. sglang/srt/layers/moe/ep_moe/layer.py +8 -286
  48. sglang/srt/layers/moe/fused_moe_triton/layer.py +6 -11
  49. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  50. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  51. sglang/srt/layers/moe/utils.py +7 -1
  52. sglang/srt/layers/quantization/__init__.py +1 -1
  53. sglang/srt/layers/quantization/fp8.py +84 -18
  54. sglang/srt/layers/quantization/modelopt_quant.py +1 -1
  55. sglang/srt/layers/quantization/quark/quark.py +3 -1
  56. sglang/srt/layers/quantization/w4afp8.py +2 -16
  57. sglang/srt/lora/lora_manager.py +0 -8
  58. sglang/srt/managers/overlap_utils.py +18 -16
  59. sglang/srt/managers/schedule_batch.py +119 -90
  60. sglang/srt/managers/schedule_policy.py +1 -1
  61. sglang/srt/managers/scheduler.py +213 -126
  62. sglang/srt/managers/scheduler_metrics_mixin.py +1 -1
  63. sglang/srt/managers/scheduler_output_processor_mixin.py +180 -86
  64. sglang/srt/managers/tokenizer_manager.py +270 -53
  65. sglang/srt/managers/tp_worker.py +39 -28
  66. sglang/srt/mem_cache/allocator.py +7 -2
  67. sglang/srt/mem_cache/chunk_cache.py +1 -1
  68. sglang/srt/mem_cache/memory_pool.py +162 -68
  69. sglang/srt/mem_cache/radix_cache.py +8 -3
  70. sglang/srt/mem_cache/swa_radix_cache.py +70 -14
  71. sglang/srt/model_executor/cuda_graph_runner.py +1 -1
  72. sglang/srt/model_executor/forward_batch_info.py +4 -18
  73. sglang/srt/model_executor/model_runner.py +55 -51
  74. sglang/srt/model_loader/__init__.py +1 -1
  75. sglang/srt/model_loader/loader.py +187 -6
  76. sglang/srt/model_loader/weight_utils.py +3 -0
  77. sglang/srt/models/falcon_h1.py +11 -9
  78. sglang/srt/models/gemma3_mm.py +16 -0
  79. sglang/srt/models/grok.py +5 -13
  80. sglang/srt/models/mixtral.py +1 -3
  81. sglang/srt/models/mllama4.py +11 -1
  82. sglang/srt/models/nemotron_h.py +514 -0
  83. sglang/srt/models/utils.py +5 -1
  84. sglang/srt/sampling/sampling_batch_info.py +11 -9
  85. sglang/srt/server_args.py +100 -33
  86. sglang/srt/speculative/eagle_worker.py +11 -13
  87. sglang/srt/speculative/ngram_worker.py +12 -11
  88. sglang/srt/speculative/spec_utils.py +0 -1
  89. sglang/srt/two_batch_overlap.py +1 -0
  90. sglang/srt/utils/common.py +18 -0
  91. sglang/srt/utils/hf_transformers_utils.py +2 -0
  92. sglang/test/longbench_v2/__init__.py +1 -0
  93. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  94. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  95. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  96. sglang/test/run_eval.py +40 -0
  97. sglang/test/simple_eval_longbench_v2.py +332 -0
  98. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  99. sglang/test/test_deterministic.py +18 -2
  100. sglang/test/test_deterministic_utils.py +81 -0
  101. sglang/test/test_disaggregation_utils.py +63 -0
  102. sglang/test/test_utils.py +32 -11
  103. sglang/version.py +1 -1
  104. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +4 -4
  105. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +109 -98
  106. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  107. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  108. sglang/test/test_block_fp8_ep.py +0 -358
  109. /sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +0 -0
  110. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  111. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  112. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/forward_batch_info.py CHANGED
@@ -75,10 +75,6 @@ class ForwardMode(IntEnum):
     # Used in speculative decoding: extend a batch in the draft model.
     DRAFT_EXTEND = auto()
 
-    # A dummy first batch to start the pipeline for overlap scheduler.
-    # It is now used for triggering the sampling_info_done event for the first prefill batch.
-    DUMMY_FIRST = auto()
-
     # Split Prefill for PD multiplexing
     SPLIT_PREFILL = auto()
 
@@ -128,9 +124,6 @@ class ForwardMode(IntEnum):
     def is_cpu_graph(self):
         return self == ForwardMode.DECODE
 
-    def is_dummy_first(self):
-        return self == ForwardMode.DUMMY_FIRST
-
     def is_split_prefill(self):
         return self == ForwardMode.SPLIT_PREFILL
 
@@ -285,6 +278,9 @@ class ForwardBatch:
     can_run_dp_cuda_graph: bool = False
     global_forward_mode: Optional[ForwardMode] = None
 
+    # Whether this batch is prefill-only (no token generation needed)
+    is_prefill_only: bool = False
+
     # Speculative decoding
     spec_info: Optional[SpecInput] = None
     spec_algorithm: SpeculativeAlgorithm = None
@@ -332,6 +328,7 @@ class ForwardBatch:
             is_extend_in_batch=batch.is_extend_in_batch,
             can_run_dp_cuda_graph=batch.can_run_dp_cuda_graph,
             global_forward_mode=batch.global_forward_mode,
+            is_prefill_only=batch.is_prefill_only,
             lora_ids=batch.lora_ids,
             sampling_info=batch.sampling_info,
             req_to_token_pool=model_runner.req_to_token_pool,
@@ -902,17 +899,6 @@ class ForwardBatch:
         return self.tbo_split_seq_index is not None
 
 
-@dataclass
-class ForwardBatchOutput:
-    # FIXME(lsyin): unify the forward batch output between different spec and parallelism
-    # need to be more organized
-    logits_output: Optional[torch.Tensor] = None
-    next_token_ids: Optional[torch.Tensor] = None
-    num_accepted_tokens: Optional[int] = None
-    pp_proxy_tensors: Optional[PPProxyTensors] = None
-    can_run_cuda_graph: bool = False
-
-
 def enable_num_token_non_padded(server_args):
     return get_moe_expert_parallel_world_size() > 1
 
sglang/srt/model_executor/model_runner.py CHANGED
@@ -29,6 +29,7 @@ from typing import List, Optional, Tuple, Union
 import torch
 import torch.distributed as dist
 
+from sglang.srt.configs import FalconH1Config, NemotronHConfig, Qwen3NextConfig
 from sglang.srt.configs.device_config import DeviceConfig
 from sglang.srt.configs.load_config import LoadConfig, LoadFormat
 from sglang.srt.configs.model_config import (
@@ -354,8 +355,9 @@ class ModelRunner:
         if architectures and not any("Llama4" in arch for arch in architectures):
             self.is_hybrid = self.model_config.is_hybrid = True
 
-        if self.is_hybrid_gdn:
-            logger.warning("Hybrid GDN model detected, disable radix cache")
+        if config := self.mambaish_config:
+            class_name = config.__class__.__name__
+            logger.warning(f"{class_name} model detected, disable radix cache")
             self.server_args.disable_radix_cache = True
             if self.server_args.max_mamba_cache_size is None:
                 if self.server_args.max_running_requests is not None:
@@ -364,6 +366,7 @@ class ModelRunner:
                     )
                 else:
                     self.server_args.max_mamba_cache_size = 512
+            if self.hybrid_gdn_config is not None:
                 self.server_args.max_mamba_cache_size = (
                     self.server_args.max_mamba_cache_size
                     // (
@@ -880,7 +883,7 @@ class ModelRunner:
         load_config = LoadConfig(load_format=load_format)
 
         # Only support DefaultModelLoader for now
-        loader = get_model_loader(load_config)
+        loader = get_model_loader(load_config, self.model_config)
         if not isinstance(loader, DefaultModelLoader):
             message = f"Failed to get model loader: {loader}."
             return False, message
@@ -1267,8 +1270,8 @@ class ModelRunner:
                 "num_nextn_predict_layers",
                 self.num_effective_layers,
             )
-        elif self.is_hybrid_gdn:
-            num_layers = len(self.model_config.hf_config.full_attention_layer_ids)
+        elif config := self.mambaish_config:
+            num_layers = len(config.full_attention_layer_ids)
         else:
            num_layers = self.num_effective_layers
        if self.use_mla_backend:
@@ -1277,6 +1280,17 @@ class ModelRunner:
                 * num_layers
                 * torch._utils._element_size(self.kv_cache_dtype)
             )
+            # Add indexer KV cache overhead for NSA models (DeepSeek V3.2)
+            if is_deepseek_nsa(self.model_config.hf_config):
+                index_head_dim = get_nsa_index_head_dim(self.model_config.hf_config)
+                indexer_size_per_token = (
+                    index_head_dim
+                    + index_head_dim // NSATokenToKVPool.quant_block_size * 4
+                )
+                element_size = torch._utils._element_size(
+                    NSATokenToKVPool.index_k_with_scale_buffer_dtype
+                )
+                cell_size += indexer_size_per_token * num_layers * element_size
         else:
             cell_size = (
                 self.model_config.get_num_kv_heads(get_attention_tp_size())
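For a sense of the added NSA overhead: per token and per full-attention layer, the indexer stores index_head_dim index elements plus 4 extra elements of scale for every quant_block_size block, all in the index buffer's dtype. A back-of-the-envelope check with assumed values (none of the numbers below come from the diff or a real config):

# Illustrative only; every value here is an assumption, not read from a model config.
index_head_dim = 128      # assumed NSA index head dimension
quant_block_size = 128    # assumed NSATokenToKVPool.quant_block_size
element_size = 1          # assumed 1-byte index_k_with_scale_buffer dtype
num_layers = 61           # assumed number of full-attention layers

indexer_size_per_token = index_head_dim + index_head_dim // quant_block_size * 4
print(indexer_size_per_token)                              # 132 elements per layer
print(indexer_size_per_token * num_layers * element_size)  # 8052 extra bytes per token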
@@ -1288,22 +1302,32 @@ class ModelRunner:
         rest_memory = available_gpu_memory - total_gpu_memory * (
             1 - self.mem_fraction_static
         )
-        if self.is_hybrid_gdn:
+        if config := self.mambaish_config:
             rest_memory -= (
                 self.server_args.max_mamba_cache_size
-                * self.model_config.hf_config.mamba_cache_per_req
+                * config.mamba2_cache_params.mamba_cache_per_req
                 / (1 << 30)
             )
         max_num_token = int(rest_memory * (1 << 30) // cell_size)
         return max_num_token
 
     @property
-    def is_hybrid_gdn(self):
-        return self.model_config.hf_config.architectures[0] in [
-            "Qwen3NextForCausalLM",
-            "Qwen3NextForCausalLMMTP",
-            "FalconH1ForCausalLM",
-        ]
+    def hybrid_gdn_config(self):
+        config = self.model_config.hf_config
+        if isinstance(config, Qwen3NextConfig):
+            return config
+        return None
+
+    @property
+    def mamba2_config(self):
+        config = self.model_config.hf_config
+        if isinstance(config, FalconH1Config | NemotronHConfig):
+            return config
+        return None
+
+    @property
+    def mambaish_config(self):
+        return self.mamba2_config or self.hybrid_gdn_config
 
     def set_num_token_hybrid(self):
         if (
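The string match on hf_config.architectures[0] is replaced by isinstance checks against typed config classes, so every mamba-style family flows through one mambaish_config path. A condensed, standalone sketch of the same dispatch (simplified; the real properties live on ModelRunner, and the config classes come from the import added above):

from typing import Optional

from sglang.srt.configs import FalconH1Config, NemotronHConfig, Qwen3NextConfig


def mambaish_config(hf_config) -> Optional[object]:
    """Return the typed config if the model needs mamba-style caches, else None."""
    if isinstance(hf_config, Qwen3NextConfig):                    # hybrid GDN family
        return hf_config
    if isinstance(hf_config, (FalconH1Config, NemotronHConfig)):  # mamba2 family
        return hf_config
    return None


# Call sites then use the walrus pattern from the hunks above, e.g.:
# if config := mambaish_config(self.model_config.hf_config):
#     reserve = max_mamba_cache_size * config.mamba2_cache_params.mamba_cache_per_req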
@@ -1438,7 +1462,7 @@ class ModelRunner:
                 ),
                 4096,
             )
-        if self.is_hybrid_gdn:
+        if self.mambaish_config is not None:
             max_num_reqs = min(max_num_reqs, self.server_args.max_mamba_cache_size)
 
         if self.spec_algorithm.is_eagle() or self.spec_algorithm.is_standalone():
@@ -1519,26 +1543,14 @@ class ModelRunner:
                 enable_memory_saver=self.server_args.enable_memory_saver,
                 pre_alloc_size=pre_alloc_size,
             )
-        elif self.is_hybrid_gdn:
-            config = self.model_config.hf_config
-            (
-                conv_state_shape,
-                temporal_state_shape,
-                conv_dtype,
-                ssm_dtype,
-                mamba_layers,
-            ) = config.hybrid_gdn_params
+        elif config := self.mambaish_config:
             self.req_to_token_pool = HybridReqToTokenPool(
                 size=max_num_reqs,
                 max_context_len=self.model_config.context_len
                 + extra_max_context_len,
                 device=self.device,
                 enable_memory_saver=self.server_args.enable_memory_saver,
-                conv_state_shape=conv_state_shape,
-                temporal_state_shape=temporal_state_shape,
-                conv_dtype=conv_dtype,
-                ssm_dtype=ssm_dtype,
-                mamba_layers=mamba_layers,
+                cache_params=config.mamba2_cache_params,
                 speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
             )
         else:
@@ -1640,7 +1652,7 @@ class ModelRunner:
                 enable_kvcache_transpose=False,
                 device=self.device,
             )
-        elif self.is_hybrid_gdn:
+        elif config := self.mambaish_config:
             self.token_to_kv_pool = HybridLinearKVPool(
                 page_size=self.page_size,
                 size=self.max_total_num_tokens,
@@ -1651,9 +1663,7 @@ class ModelRunner:
                 head_dim=self.model_config.head_dim,
                 # if draft worker, we only need 1 attention layer's kv pool
                 full_attention_layer_ids=(
-                    [0]
-                    if self.is_draft_worker
-                    else self.model_config.hf_config.full_attention_layer_ids
+                    [0] if self.is_draft_worker else config.full_attention_layer_ids
                 ),
                 enable_kvcache_transpose=False,
                 device=self.device,
@@ -1672,13 +1682,17 @@ class ModelRunner:
                 enable_memory_saver=self.server_args.enable_memory_saver,
                 start_layer=self.start_layer,
                 end_layer=self.end_layer,
+                enable_kv_cache_copy=(
+                    self.server_args.speculative_algorithm is not None
+                ),
             )
 
         # Initialize token_to_kv_pool_allocator
         need_sort = self.server_args.disaggregation_mode in ("decode", "prefill")
         if self.token_to_kv_pool_allocator is None:
             if _is_npu and (
-                self.server_args.attention_backend == "ascend" or self.is_hybrid_gdn
+                self.server_args.attention_backend == "ascend"
+                or self.hybrid_gdn_config is not None
             ):
                 self.token_to_kv_pool_allocator = AscendPagedTokenToKVPoolAllocator(
                     self.max_total_num_tokens,
@@ -1743,16 +1757,10 @@ class ModelRunner:
 
     def _get_attention_backend(self):
         """Init attention kernel backend."""
-        self.decode_attention_backend_str = (
-            self.server_args.decode_attention_backend
-            if self.server_args.decode_attention_backend
-            else self.server_args.attention_backend
-        )
-        self.prefill_attention_backend_str = (
-            self.server_args.prefill_attention_backend
-            if self.server_args.prefill_attention_backend
-            else self.server_args.attention_backend
+        self.prefill_attention_backend_str, self.decode_attention_backend_str = (
+            self.server_args.get_attention_backends()
         )
+
         if self.decode_attention_backend_str != self.prefill_attention_backend_str:
             from sglang.srt.layers.attention.hybrid_attn_backend import (
                 HybridAttnBackend,
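The per-phase backend fallback is now centralized in a single ServerArgs helper instead of being duplicated inline. The hunk only shows the call site; a minimal sketch of what get_attention_backends() presumably does, inferred from the removed inline logic (the body below is an assumption, not copied from server_args.py):

# Sketch only: assumed shape of ServerArgs.get_attention_backends(),
# mirroring the inline fallback removed in this hunk.
def get_attention_backends(self):
    prefill = self.prefill_attention_backend or self.attention_backend
    decode = self.decode_attention_backend or self.attention_backend
    return prefill, decode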
@@ -2057,15 +2065,11 @@ class ModelRunner:
     def _preprocess_logits(
         self, logits_output: LogitsProcessorOutput, sampling_info: SamplingBatchInfo
     ):
-        # Apply logit bias
-        if sampling_info.sampling_info_done:
-            # Overlap mode: the function update_regex_vocab_mask was executed
-            # in process_batch_result of the last batch.
-            if sampling_info.grammars:
-                sampling_info.sampling_info_done.wait()
-        else:
-            # Normal mode: Put CPU-heavy tasks here. They will be overlapped with the forward pass.
-            sampling_info.update_regex_vocab_mask()
+        # NOTE: In overlap mode, the function update_regex_vocab_mask (in sample)
+        # was executed after we processed last batch's results.
+
+        # Calculate logits bias and apply it to next_token_logits.
+        sampling_info.update_regex_vocab_mask()
         sampling_info.apply_logits_bias(logits_output.next_token_logits)
 
     def sample(
sglang/srt/model_loader/__init__.py CHANGED
@@ -24,7 +24,7 @@ def get_model(
     load_config: LoadConfig,
     device_config: DeviceConfig,
 ) -> nn.Module:
-    loader = get_model_loader(load_config)
+    loader = get_model_loader(load_config, model_config)
     return loader.load_model(
         model_config=model_config,
         device_config=device_config,
sglang/srt/model_loader/loader.py CHANGED
@@ -37,10 +37,22 @@ import numpy as np
 import requests
 import safetensors.torch
 import torch
+
+# Try to import accelerate (optional dependency)
+try:
+    from accelerate import infer_auto_device_map, init_empty_weights
+    from accelerate.utils import get_max_memory
+
+    HAS_ACCELERATE = True
+except ImportError:
+    HAS_ACCELERATE = False
+    infer_auto_device_map = None
+    init_empty_weights = None
+    get_max_memory = None
+
 from huggingface_hub import HfApi, hf_hub_download
 from torch import nn
-from tqdm.auto import tqdm
-from transformers import AutoModelForCausalLM
+from transformers import AutoConfig, AutoModelForCausalLM
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
 from sglang.srt.configs.load_config import LoadConfig, LoadFormat
@@ -54,6 +66,8 @@ from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
+from sglang.srt.layers.modelopt_utils import QUANT_CFG_CHOICES
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_loader.remote_instance_weight_loader_utils import (
     trigger_transferring_weights_request,
 )
@@ -62,6 +76,11 @@ from sglang.srt.model_loader.utils import (
     post_load_weights,
     set_default_torch_dtype,
 )
+
+# Constants for memory management
+DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION = (
+    0.8  # Reserve 20% GPU memory headroom for ModelOpt calibration
+)
 from sglang.srt.model_loader.weight_utils import (
     _BAR_FORMAT,
     default_weight_loader,
@@ -94,6 +113,8 @@ if TYPE_CHECKING:
     from sglang.srt.layers.quantization.base_config import QuantizationConfig
 
 _is_npu = is_npu()
+# ModelOpt: QUANT_CFG_CHOICES is imported from modelopt_utils.py
+# which contains the complete mapping of quantization config choices
 
 
 @contextmanager
@@ -477,12 +498,78 @@ class DefaultModelLoader(BaseModelLoader):
             model_config.model_path, model_config.revision, fall_back_to_pt=True
         )
 
+    def _load_modelopt_base_model(self, model_config: ModelConfig) -> nn.Module:
+        """Load and prepare the base model for ModelOpt quantization.
+
+        This method handles the common model loading logic shared between
+        DefaultModelLoader (conditional) and ModelOptModelLoader (dedicated).
+        """
+        if not HAS_ACCELERATE:
+            raise ImportError(
+                "accelerate is required for ModelOpt quantization. "
+                "Please install it with: pip install accelerate"
+            )
+
+        hf_config = AutoConfig.from_pretrained(
+            model_config.model_path, trust_remote_code=True
+        )
+        with init_empty_weights():
+            torch_dtype = getattr(hf_config, "torch_dtype", torch.float16)
+            model = AutoModelForCausalLM.from_config(
+                hf_config, torch_dtype=torch_dtype, trust_remote_code=True
+            )
+        max_memory = get_max_memory()
+        inferred_device_map = infer_auto_device_map(model, max_memory=max_memory)
+
+        on_cpu = "cpu" in inferred_device_map.values()
+        model_kwargs = {"torch_dtype": "auto"}
+        device_map = "auto"
+
+        if on_cpu:
+            for device in max_memory.keys():
+                if isinstance(device, int):
+                    max_memory[device] *= DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION
+
+            logger.warning(
+                "Model does not fit to the GPU mem. "
+                f"We apply the following memory limit for calibration: \n{max_memory}\n"
+                f"If you hit GPU OOM issue, please adjust the memory fraction "
+                f"(currently {DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION}) or "
+                "reduce the calibration `batch_size` manually."
+            )
+            model_kwargs["max_memory"] = max_memory
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_config.model_path,
+            device_map=device_map,
+            **model_kwargs,
+            trust_remote_code=True,
+        )
+        logger.info(f"ModelOpt quantization requested: {model_config.modelopt_quant}")
+
+        quant_choice_str = model_config.modelopt_quant
+        if not isinstance(quant_choice_str, str):
+            raise TypeError(
+                f"modelopt_quant must be a string preset key (e.g., 'fp8'), "
+                f"got {type(quant_choice_str)}"
+            )
+
+        return model
+
     def load_model(
         self,
         *,
         model_config: ModelConfig,
         device_config: DeviceConfig,
     ) -> nn.Module:
+
+        if hasattr(model_config, "modelopt_quant") and model_config.modelopt_quant:
+            # Load base model using shared method
+            model = self._load_modelopt_base_model(model_config)
+            # Note: DefaultModelLoader doesn't do additional quantization processing
+            # For full ModelOpt quantization, use ModelOptModelLoader
+            return model.eval()
+
         target_device = torch.device(device_config.device)
         with set_default_torch_dtype(model_config.dtype):
             with target_device:
@@ -491,9 +578,9 @@ class DefaultModelLoader(BaseModelLoader):
                     self.load_config,
                 )
 
-                self.load_weights_and_postprocess(
-                    model, self._get_all_weights(model_config, model), target_device
-                )
+            self.load_weights_and_postprocess(
+                model, self._get_all_weights(model_config, model), target_device
+            )
 
         return model.eval()
 
@@ -1668,9 +1755,103 @@ def load_model_with_cpu_quantization(
     return model.eval()
 
 
-def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
+class ModelOptModelLoader(DefaultModelLoader):
+    """
+    Model loader that applies NVIDIA Model Optimizer quantization
+    """
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        # Any ModelOpt specific initialization if needed
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+
+        logger.info("ModelOptModelLoader: Loading base model...")
+
+        # Use shared method from parent class to load base model
+        model = self._load_modelopt_base_model(model_config)
+
+        # Import ModelOpt modules (already done in _load_modelopt_base_model, but needed here for quantization)
+        try:
+            import modelopt.torch.quantization as mtq
+            from modelopt.torch.utils.dataset_utils import create_forward_loop
+        except ImportError:
+            logger.error(
+                "NVIDIA Model Optimizer (modelopt) library not found. "
+                "Please install it to use 'modelopt_quant' feature."
+            )
+            raise
+
+        quant_choice_str = model_config.modelopt_quant
+
+        quant_cfg_name = QUANT_CFG_CHOICES.get(quant_choice_str)
+        if not quant_cfg_name:
+            raise ValueError(
+                f"Invalid modelopt_quant choice: '{quant_choice_str}'. "
+                f"Available choices in QUANT_CFG_CHOICES: {list(QUANT_CFG_CHOICES.keys())}. "
+                "Ensure QUANT_CFG_CHOICES is correctly defined with mappings to "
+                "attribute names of config objects in modelopt.torch.quantization."
+            )
+
+        try:
+            # getattr will fetch the config object, e.g., mtq.FP8_DEFAULT_CFG
+            quant_cfg = getattr(mtq, quant_cfg_name)
+        except AttributeError:
+            raise AttributeError(
+                f"ModelOpt quantization config attribute '{quant_cfg_name}' "
+                f"(from choice '{quant_choice_str}') not found in modelopt.torch.quantization module. "
+                "Please verify QUANT_CFG_CHOICES and the ModelOpt library."
+            )
+
+        # For now, assume no calibration. Calibration setup is a separate, more complex step.
+        use_calibration = False  # This would ideally be a configurable parameter
+        calib_dataloader = None  # This would need to be provided/configured
+
+        calibrate_loop = (
+            create_forward_loop(dataloader=calib_dataloader)
+            if use_calibration
+            else None
+        )
+
+        if use_calibration and calib_dataloader is None:
+            logger.warning(
+                "ModelOpt calibration requested but no calib_dataloader provided. "
+                "Proceeding without calibration. Quantization accuracy may be affected."
+            )
+
+        logger.info(
+            f"Quantizing model with ModelOpt using config attribute: mtq.{quant_cfg_name}"
+        )
+
+        try:
+            model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
+            logger.info("Model successfully quantized with ModelOpt.")
+        except Exception as e:
+            logger.error(f"Error during ModelOpt mtq.quantize call: {e}")
+            raise
+        mtq.print_quant_summary(model)
+
+        return model.eval()
+
+
+def get_model_loader(
+    load_config: LoadConfig, model_config: Optional[ModelConfig] = None
+) -> BaseModelLoader:
     """Get a model loader based on the load format."""
 
+    if (
+        model_config
+        and hasattr(model_config, "modelopt_quant")
+        and model_config.modelopt_quant
+    ):
+        logger.info("Using ModelOptModelLoader due to 'modelopt_quant' config.")
+        return ModelOptModelLoader(load_config)
+
     if isinstance(load_config.load_format, type):
         return load_config.load_format(load_config)
 
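Loader selection is now driven by the model config rather than the load format alone. A hypothetical usage sketch (the model name is a placeholder, and setting modelopt_quant directly on the instance is an assumption made for illustration; in practice the value comes from the model/server configuration):

from sglang.srt.configs.load_config import LoadConfig
from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.model_loader.loader import get_model_loader

model_config = ModelConfig("meta-llama/Llama-3.1-8B-Instruct")  # placeholder model
model_config.modelopt_quant = "fp8"   # preset key looked up in QUANT_CFG_CHOICES
loader = get_model_loader(LoadConfig(), model_config)
print(type(loader).__name__)          # ModelOptModelLoader; DefaultModelLoader otherwise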
sglang/srt/model_loader/weight_utils.py CHANGED
@@ -226,6 +226,9 @@ def get_quant_config(
             return ModelOptFp4Config.from_config(config)
         else:
             return quant_cls.from_config(config)
+    elif model_config.quantization == "modelopt_fp8":
+        if config["producer"]["name"] == "modelopt_fp8":
+            return quant_cls.from_config(config)
     else:
         raise ValueError(
             f"Unsupported quantization config"
sglang/srt/models/falcon_h1.py CHANGED
@@ -8,6 +8,10 @@ from torch import nn
 from sglang.srt.configs.falcon_h1 import FalconH1Config
 from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention.hybrid_linear_attn_backend import (
+    HybridLinearAttnBackend,
+    Mamba2AttnBackend,
+)
 from sglang.srt.layers.attention.mamba.mamba import MambaMixer2
 from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
 from sglang.srt.layers.dp_attention import (
@@ -184,18 +188,12 @@ class FalconH1HybridAttentionDecoderLayer(nn.Module):
         )
 
         self.mamba = MambaMixer2(
+            cache_params=config.mamba2_cache_params,
             hidden_size=config.hidden_size,
-            ssm_state_size=config.mamba_d_state,
-            conv_kernel_size=config.mamba_d_conv,
-            intermediate_size=self.d_ssm,
             use_conv_bias=config.mamba_conv_bias,
             use_bias=config.mamba_proj_bias,
             n_groups=config.mamba_n_groups,
-            num_heads=config.mamba_n_heads,
-            layer_id=layer_id,
-            head_dim=config.mamba_d_head,
             rms_norm_eps=config.rms_norm_eps,
-            chunk_size=config.mamba_chunk_size,
             activation=config.hidden_act,
             use_rms_norm=config.mamba_rms_norm,
             prefix=f"{prefix}.mixer",
@@ -339,12 +337,16 @@ class FalconH1HybridAttentionDecoderLayer(nn.Module):
         )
         attention_hidden_states = attention_hidden_states * self.attn_out_multiplier
 
+        attn_backend = forward_batch.attn_backend
+        assert isinstance(attn_backend, HybridLinearAttnBackend)
+        assert isinstance(attn_backend.linear_attn_backend, Mamba2AttnBackend)
         # Mamba block
         mamba_hidden_states = torch.empty_like(hidden_states)
-        self.mamba(
+        attn_backend.linear_attn_backend.forward(
+            self.mamba,
             hidden_states * self.ssm_in_multiplier,
             mamba_hidden_states,
-            forward_batch=forward_batch,
+            layer_id=self.layer_id,
             mup_vector=self.mup_vector,
         )
         mamba_hidden_states = mamba_hidden_states * self.ssm_out_multiplier
sglang/srt/models/gemma3_mm.py CHANGED
@@ -16,6 +16,7 @@
 # https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/gemma3_mm.py
 
 import logging
+import re
 from functools import lru_cache
 from typing import Dict, Iterable, List, Optional, Set, Tuple, TypedDict
 
@@ -154,6 +155,10 @@ class Gemma3ForConditionalGeneration(PreTrainedModel):
     embedding_modules = {}
     embedding_padding_modules = []
     supports_lora = True
+    # Pattern to match language model layers only (skip vision_tower and multi_modal_projector)
+    lora_pattern = re.compile(
+        r"^language_model\.model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
+    )
 
     def __init__(
         self,
@@ -165,6 +170,13 @@ class Gemma3ForConditionalGeneration(PreTrainedModel):
         self.config = config
         self.quant_config = quant_config
 
+        # For LoRA compatibility: expose text_config attributes at top level
+        # This allows LoRA code to work without special multimodal handling
+        if not hasattr(config, "num_hidden_layers"):
+            config.num_hidden_layers = config.text_config.num_hidden_layers
+        if not hasattr(config, "hidden_size"):
+            config.hidden_size = config.text_config.hidden_size
+
         self.vision_tower = SiglipVisionModel(
             config=config.vision_config,
             quant_config=quant_config,
@@ -380,6 +392,10 @@ class Gemma3ForConditionalGeneration(PreTrainedModel):
 
         return hs
 
+    def should_apply_lora(self, module_name: str) -> bool:
+        """Skip vision tower and multi_modal_projector for LoRA."""
+        return bool(self.lora_pattern.match(module_name))
+
     def tie_weights(self):
         return self.language_model.tie_weights()
 
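A quick check of the new lora_pattern: only language-model attention/MLP projection modules match, so should_apply_lora returns False for the vision tower and projector. The module names below are illustrative:

import re

lora_pattern = re.compile(
    r"^language_model\.model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
)

print(bool(lora_pattern.match("language_model.model.layers.3.self_attn.qkv_proj")))  # True
print(bool(lora_pattern.match("language_model.model.layers.3.mlp.gate_up_proj")))    # True
print(bool(lora_pattern.match("vision_tower.vision_model.encoder.layers.0.self_attn.q_proj")))  # False
print(bool(lora_pattern.match("multi_modal_projector.mm_input_projection")))         # False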
sglang/srt/models/grok.py CHANGED
@@ -49,7 +49,6 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.ep_moe.layer import EPMoE
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.moe.router import fused_moe_router_shim
 from sglang.srt.layers.moe.topk import TopK
@@ -176,17 +175,7 @@ class Grok1MoE(nn.Module):
             custom_routing_function=custom_routing_function,
         )
 
-        kwargs = {}
-        if get_moe_expert_parallel_world_size() > 1:
-            MoEImpl = EPMoE
-        else:
-            MoEImpl = FusedMoE
-            kwargs["reduce_results"] = reduce_results
-            kwargs["use_presharded_weights"] = use_presharded_weights
-            kwargs["inplace"] = inplace
-            kwargs["no_combine"] = no_combine
-
-        self.experts = MoEImpl(
+        self.experts = FusedMoE(
             num_experts=num_experts,
             top_k=top_k,
             layer_id=layer_id,
@@ -195,7 +184,10 @@ class Grok1MoE(nn.Module):
             params_dtype=params_dtype,
             quant_config=quant_config,
             activation="gelu",
-            **kwargs,
+            reduce_results=reduce_results,
+            use_presharded_weights=use_presharded_weights,
+            inplace=inplace,
+            no_combine=no_combine,
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: