sglang 0.4.3.post1__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +208 -295
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +9 -6
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +143 -6
- sglang/srt/managers/schedule_batch.py +238 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +681 -259
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +224 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +44 -18
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +94 -36
- sglang/srt/model_executor/cuda_graph_runner.py +55 -24
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +209 -28
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -29
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +136 -52
- sglang/srt/speculative/build_eagle_tree.py +2 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
- sglang/srt/speculative/eagle_utils.py +92 -58
- sglang/srt/speculative/eagle_worker.py +186 -94
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +214 -166
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/srt/models/deepseek.py
CHANGED
@@ -46,6 +46,7 @@ from sglang.srt.layers.vocab_parallel_embedding import (
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix


 class DeepseekMLP(nn.Module):
@@ -57,10 +58,15 @@ class DeepseekMLP(nn.Module):
         hidden_act: str,
         quant_config: Optional[QuantizationConfig] = None,
         reduce_results: bool = True,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size,
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
         )
         self.down_proj = RowParallelLinear(
             intermediate_size,
@@ -68,6 +74,7 @@ class DeepseekMLP(nn.Module):
             bias=False,
             quant_config=quant_config,
             reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
         )
         if hidden_act != "silu":
             raise ValueError(
@@ -89,6 +96,7 @@ class DeepseekMoE(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.config = config
@@ -110,6 +118,7 @@ class DeepseekMoE(nn.Module):
                     hidden_act=config.hidden_act,
                     quant_config=quant_config,
                     reduce_results=False,
+                    prefix=add_prefix(f"{idx}.experts", prefix),
                 )
                 for idx in range(self.n_routed_experts)
             ]
@@ -117,7 +126,11 @@ class DeepseekMoE(nn.Module):
         self.pack_params()

         self.gate = ReplicatedLinear(
-            config.hidden_size,
+            config.hidden_size,
+            self.n_routed_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
         )

         if config.n_shared_experts is not None:
@@ -128,6 +141,7 @@ class DeepseekMoE(nn.Module):
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
                 reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
             )

     def pack_params(self):
@@ -185,6 +199,7 @@ class DeepseekAttention(nn.Module):
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -216,6 +231,7 @@ class DeepseekAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
         )

         self.o_proj = RowParallelLinear(
@@ -223,6 +239,7 @@ class DeepseekAttention(nn.Module):
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
         )

         self.rotary_emb = get_rope(
@@ -238,6 +255,7 @@ class DeepseekAttention(nn.Module):
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
         )

     def forward(
@@ -261,6 +279,7 @@ class DeepseekDecoderLayer(nn.Module):
         config: PretrainedConfig,
         layer_id: int,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -276,19 +295,25 @@ class DeepseekDecoderLayer(nn.Module):
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
         )
         if (
             config.n_routed_experts is not None
             and layer_id >= config.first_k_dense_replace
             and layer_id % config.moe_layer_freq == 0
         ):
-            self.mlp = DeepseekMoE(
+            self.mlp = DeepseekMoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
         else:
             self.mlp = DeepseekMLP(
                 hidden_size=config.hidden_size,
                 intermediate_size=config.intermediate_size,
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
             )
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(
@@ -328,6 +353,7 @@ class DeepseekModel(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.padding_idx = config.pad_token_id
@@ -339,7 +365,12 @@ class DeepseekModel(nn.Module):
         )
         self.layers = nn.ModuleList(
             [
-                DeepseekDecoderLayer(
+                DeepseekDecoderLayer(
+                    config,
+                    layer_id,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_id}", prefix),
+                )
                 for layer_id in range(config.num_hidden_layers)
             ]
         )
@@ -368,13 +399,19 @@ class DeepseekForCausalLM(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.config = config
         self.quant_config = quant_config
-        self.model = DeepseekModel(
+        self.model = DeepseekModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
         self.lm_head = ParallelLMHead(
-            config.vocab_size,
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
         )
         self.logits_processor = LogitsProcessor(config)

sglang/srt/models/deepseek_nextn.py
CHANGED
@@ -38,7 +38,7 @@ from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.deepseek_v2 import DeepseekV2DecoderLayer, DeepseekV3ForCausalLM
-from sglang.srt.utils import is_hip
+from sglang.srt.utils import add_prefix, is_hip

 is_hip_ = is_hip()

@@ -48,6 +48,7 @@ class DeepseekModelNextN(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.vocab_size = config.vocab_size
@@ -56,6 +57,7 @@ class DeepseekModelNextN(nn.Module):
             config.vocab_size,
             config.hidden_size,
             enable_tp=not global_server_args_dict["enable_dp_attention"],
+            prefix=add_prefix("embed_tokens", prefix),
         )

         self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -64,7 +66,11 @@ class DeepseekModelNextN(nn.Module):
         self.eh_proj = nn.Linear(2 * config.hidden_size, config.hidden_size, bias=False)

         self.decoder = DeepseekV2DecoderLayer(
-            config,
+            config,
+            0,
+            quant_config=quant_config,
+            is_nextn=True,
+            prefix=add_prefix("decoder", prefix),
         )

         self.shared_head = nn.Module()
@@ -108,25 +114,30 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ) -> None:
         nn.Module.__init__(self)
         self.config = config
         self.quant_config = quant_config

-        self.model = DeepseekModelNextN(
+        self.model = DeepseekModelNextN(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )

         if global_server_args_dict["enable_dp_attention"]:
-            self.
+            self.lm_head = ReplicatedLinear(
                 config.hidden_size,
                 config.vocab_size,
                 bias=False,
+                prefix=add_prefix("model.shared_head.head", prefix),
             )
             self.logits_processor = LogitsProcessor(config, skip_all_gather=True)
         else:
-            self.
+            self.lm_head = ParallelLMHead(
                 config.vocab_size,
                 config.hidden_size,
                 quant_config=quant_config,
+                prefix=add_prefix("model.shared_head.head", prefix),
             )
             self.logits_processor = LogitsProcessor(config)

@@ -139,7 +150,7 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, forward_batch)
         return self.logits_processor(
-            input_ids, hidden_states, self.
+            input_ids, hidden_states, self.lm_head, forward_batch
         )

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
@@ -168,10 +179,8 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):

         nextn_layer_prefix = "model.layers.0"
         nextn_spec_weight_names = [
-            "shared_head.head",
             "shared_head.norm",
             "eh_proj",
-            "embed_tokens",
             "enorm",
             "hnorm",
         ]
@@ -180,17 +189,21 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
         for name, loaded_weight in weights:
             if not name.startswith(nextn_layer_prefix):
                 continue
-
-
-
-
-
-
-
-
-
-
-
+
+            # Use shared head and embed weights from target model
+            if "shared_head.head" in name or "embed_tokens" in name:
+                continue
+
+            is_decoder = True
+            # For nextn specific weights
+            for weight_name in nextn_spec_weight_names:
+                if weight_name in name:
+                    name = name.replace(nextn_layer_prefix, "model")
+                    is_decoder = False
+                    break
+            # For decoder layer weights
+            if is_decoder:
+                name = name.replace(nextn_layer_prefix, "model.decoder")

             if "rotary_emb.inv_freq" in name:
                 continue
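The recurring change in both files above is the threading of a `prefix` argument through every module constructor via `add_prefix`, so that each parallel linear, attention, and MoE submodule knows its fully qualified weight name (for example `model.layers.0.self_attn.qkv_proj`), which quantization configs and weight loaders can then match against checkpoint names. The sketch below illustrates that pattern with hypothetical stand-in classes; the dot-joining behavior assumed for `add_prefix` is inferred from the call sites in this diff, not taken from the helper's actual source in `sglang.srt.utils`.

```python
# Minimal sketch of the prefix-threading pattern applied in this diff.
# add_prefix here is an assumed re-implementation based on how it is called
# above (add_prefix(child_name, parent_prefix)); the real helper may differ.
def add_prefix(name: str, prefix: str) -> str:
    # Assumed behavior: join with ".", keep the bare name at the root.
    return name if not prefix else f"{prefix}.{name}"


class Linearish:
    """Hypothetical stand-in for a parallel linear layer; records its weight path."""

    def __init__(self, prefix: str = "") -> None:
        self.prefix = prefix


class Block:
    """Hypothetical decoder block: each child receives its own qualified prefix."""

    def __init__(self, prefix: str = "") -> None:
        self.qkv_proj = Linearish(prefix=add_prefix("qkv_proj", prefix))
        self.o_proj = Linearish(prefix=add_prefix("o_proj", prefix))


class Model:
    def __init__(self, num_layers: int = 2, prefix: str = "") -> None:
        self.layers = [
            Block(prefix=add_prefix(f"layers.{i}", prefix)) for i in range(num_layers)
        ]


if __name__ == "__main__":
    model = Model(prefix="model")
    # Prints "model.layers.0.qkv_proj": the name a quantization config or
    # checkpoint loader would use to identify this weight.
    print(model.layers[0].qkv_proj.prefix)
```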