sglang 0.4.1.post4__py3-none-any.whl → 0.4.1.post6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. sglang/bench_serving.py +18 -1
  2. sglang/lang/interpreter.py +71 -1
  3. sglang/lang/ir.py +2 -0
  4. sglang/srt/configs/__init__.py +4 -0
  5. sglang/srt/configs/chatglm.py +78 -0
  6. sglang/srt/configs/dbrx.py +279 -0
  7. sglang/srt/configs/model_config.py +16 -7
  8. sglang/srt/hf_transformers_utils.py +9 -14
  9. sglang/srt/layers/attention/__init__.py +8 -1
  10. sglang/srt/layers/attention/flashinfer_backend.py +21 -5
  11. sglang/srt/layers/linear.py +89 -47
  12. sglang/srt/layers/logits_processor.py +6 -6
  13. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +16 -5
  14. sglang/srt/layers/moe/fused_moe_triton/layer.py +39 -12
  15. sglang/srt/layers/moe/topk.py +4 -2
  16. sglang/srt/layers/parameter.py +439 -0
  17. sglang/srt/layers/quantization/__init__.py +5 -2
  18. sglang/srt/layers/quantization/fp8.py +107 -53
  19. sglang/srt/layers/quantization/fp8_utils.py +1 -1
  20. sglang/srt/layers/quantization/int8_kernel.py +54 -0
  21. sglang/srt/layers/quantization/modelopt_quant.py +174 -0
  22. sglang/srt/layers/quantization/w8a8_int8.py +117 -0
  23. sglang/srt/layers/radix_attention.py +2 -0
  24. sglang/srt/layers/vocab_parallel_embedding.py +16 -3
  25. sglang/srt/managers/cache_controller.py +307 -0
  26. sglang/srt/managers/configure_logging.py +43 -0
  27. sglang/srt/managers/data_parallel_controller.py +2 -0
  28. sglang/srt/managers/detokenizer_manager.py +0 -2
  29. sglang/srt/managers/io_struct.py +29 -13
  30. sglang/srt/managers/schedule_batch.py +7 -1
  31. sglang/srt/managers/scheduler.py +58 -15
  32. sglang/srt/managers/session_controller.py +1 -1
  33. sglang/srt/managers/tokenizer_manager.py +109 -45
  34. sglang/srt/mem_cache/memory_pool.py +313 -53
  35. sglang/srt/metrics/collector.py +32 -35
  36. sglang/srt/model_executor/cuda_graph_runner.py +14 -7
  37. sglang/srt/model_executor/forward_batch_info.py +20 -15
  38. sglang/srt/model_executor/model_runner.py +53 -10
  39. sglang/srt/models/chatglm.py +1 -1
  40. sglang/srt/models/dbrx.py +1 -1
  41. sglang/srt/models/grok.py +25 -16
  42. sglang/srt/models/llama.py +46 -4
  43. sglang/srt/models/qwen2.py +11 -0
  44. sglang/srt/models/qwen2_eagle.py +131 -0
  45. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +15 -5
  46. sglang/srt/sampling/sampling_batch_info.py +15 -5
  47. sglang/srt/sampling/sampling_params.py +1 -1
  48. sglang/srt/server.py +125 -69
  49. sglang/srt/server_args.py +39 -19
  50. sglang/srt/speculative/eagle_utils.py +93 -85
  51. sglang/srt/speculative/eagle_worker.py +48 -33
  52. sglang/srt/torch_memory_saver_adapter.py +59 -0
  53. sglang/srt/utils.py +61 -5
  54. sglang/test/test_programs.py +23 -1
  55. sglang/test/test_utils.py +36 -7
  56. sglang/version.py +1 -1
  57. {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/METADATA +16 -15
  58. {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/RECORD +61 -51
  59. {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/WHEEL +1 -1
  60. {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/LICENSE +0 -0
  61. {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post6.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py CHANGED
@@ -50,10 +50,12 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
  from sglang.srt.model_loader import get_model
  from sglang.srt.server_args import ServerArgs
  from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+ from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
  from sglang.srt.utils import (
  enable_show_time_cost,
  get_available_gpu_memory,
  init_custom_process_group,
+ is_cuda,
  is_hip,
  monkey_patch_vllm_gguf_config,
  monkey_patch_vllm_p2p_access_check,
@@ -89,6 +91,7 @@ class ModelRunner:
  self.is_draft_worker = is_draft_worker
  self.is_generation = model_config.is_generation
  self.is_multimodal = model_config.is_multimodal
+ self.should_log = tp_rank == 0
  self.spec_algorithm = SpeculativeAlgorithm.from_string(
  server_args.speculative_algorithm
  )
@@ -117,15 +120,21 @@ class ModelRunner:

  if self.is_multimodal:
  self.mem_fraction_static *= 0.95
+ logger.info(
+ f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
+ f"because this is a multimodal model."
+ )
+
  if self.model_config.hf_config.architectures == [
  "MllamaForConditionalGeneration"
  ]:
  logger.info("Automatically turn off --chunked-prefill-size for mllama.")
  server_args.chunked_prefill_size = -1
- # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
+
  if self.model_config.hf_config.architectures == [
  "Qwen2VLForConditionalGeneration"
  ]:
+ # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
  logger.info(
  "Automatically turn off --chunked-prefill-size and disable radix cache for qwen2-vl."
  )
@@ -158,6 +167,10 @@ class ModelRunner:
  # Get memory before model loading
  min_per_gpu_memory = self.init_torch_distributed()

+ self.memory_saver_adapter = TorchMemorySaverAdapter.create(
+ enable=self.server_args.enable_memory_saver
+ )
+
  # Load the model
  self.sampler = Sampler()
  self.load_model()
@@ -198,7 +211,7 @@ class ModelRunner:
  if self.device == "cuda":
  backend = "nccl"
  elif self.device == "xpu":
- # TODO(liangan1):Just use gloo to bypass the initilization fail
+ # TODO(liangan1): Just use gloo to bypass the initilization fail
  # Need to use xccl for xpu backend in the future
  backend = "gloo"
  elif self.device == "hpu":
@@ -264,11 +277,35 @@ class ModelRunner:
  monkey_patch_vllm_gguf_config()

  # Load the model
- self.model = get_model(
- model_config=self.model_config,
- load_config=self.load_config,
- device_config=DeviceConfig(self.device),
- )
+ with self.memory_saver_adapter.region():
+ self.model = get_model(
+ model_config=self.model_config,
+ load_config=self.load_config,
+ device_config=DeviceConfig(self.device),
+ )
+
+ if self.server_args.kv_cache_dtype == "fp8_e4m3":
+ if self.server_args.quantization_param_path is not None:
+ if callable(getattr(self.model, "load_kv_cache_scales", None)):
+ self.model.load_kv_cache_scales(
+ self.server_args.quantization_param_path
+ )
+ logger.info(
+ "Loaded KV cache scaling factors from %s",
+ self.server_args.quantization_param_path,
+ )
+ else:
+ raise RuntimeError(
+ "Using FP8 KV cache and scaling factors provided but "
+ "model %s does not support loading scaling factors.",
+ self.model.__class__,
+ )
+ else:
+ logger.warning(
+ "Using FP8 KV cache but no scaling factors "
+ "provided. Defaulting to scaling factors of 1.0. "
+ "This may lead to less accurate results!"
+ )

  # Parse other args
  self.sliding_window_size = (
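The new sglang/srt/torch_memory_saver_adapter.py (+59 lines, not shown in this excerpt) backs the memory_saver_adapter used above. A minimal sketch of the pattern the ModelRunner relies on — a create(enable=...) factory plus a region() context manager that degrades to a no-op when the feature is off — might look like this; the class internals and the torch_memory_saver attribute names are assumptions:

from contextlib import nullcontext


class TorchMemorySaverAdapterSketch:
    """Hypothetical sketch; the real adapter lives in torch_memory_saver_adapter.py."""

    def __init__(self, enable: bool):
        self.enable = enable
        if enable:
            # Optional dependency; the module and singleton names are assumptions.
            from torch_memory_saver import torch_memory_saver

            self._saver = torch_memory_saver

    @classmethod
    def create(cls, enable: bool):
        return cls(enable)

    def region(self):
        # Tensors allocated inside this region can later be paused and resumed
        # (e.g. to release GPU memory between runs); with enable=False it is a
        # plain no-op context manager, so call sites need no branching.
        return self._saver.region() if self.enable else nullcontext()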
@@ -386,7 +423,7 @@ class ModelRunner:

  logger.info(
  f"init custom process group: master_address={master_address}, master_port={master_port}, "
- f"rank_offset={rank_offset}, world_size={world_size}, group_name={group_name}, backend={backend}"
+ f"rank_offset={rank_offset}, rank={rank}, world_size={world_size}, group_name={group_name}, backend={backend}"
  )

  try:
@@ -509,6 +546,9 @@ class ModelRunner:
  self.kv_cache_dtype = torch.float8_e5m2fnuz
  else:
  self.kv_cache_dtype = torch.float8_e5m2
+ elif self.server_args.kv_cache_dtype == "fp8_e4m3":
+ if is_cuda():
+ self.kv_cache_dtype = torch.float8_e4m3fn
  else:
  raise ValueError(
  f"Unsupported kv_cache_dtype: {self.server_args.kv_cache_dtype}."
@@ -556,6 +596,7 @@ class ModelRunner:
  max_context_len=self.model_config.context_len + 4,
  device=self.device,
  use_records=False,
+ enable_memory_saver=self.server_args.enable_memory_saver,
  )
  if (
  self.model_config.attention_arch == AttentionArch.MLA
@@ -568,6 +609,7 @@ class ModelRunner:
  qk_rope_head_dim=self.model_config.qk_rope_head_dim,
  layer_num=self.model_config.num_hidden_layers,
  device=self.device,
+ enable_memory_saver=self.server_args.enable_memory_saver,
  )
  elif self.server_args.enable_double_sparsity:
  self.token_to_kv_pool = DoubleSparseTokenToKVPool(
@@ -578,6 +620,7 @@ class ModelRunner:
  layer_num=self.model_config.num_hidden_layers,
  device=self.device,
  heavy_channel_num=self.server_args.ds_heavy_channel_num,
+ enable_memory_saver=self.server_args.enable_memory_saver,
  )
  else:
  self.token_to_kv_pool = MHATokenToKVPool(
@@ -587,6 +630,7 @@ class ModelRunner:
  head_dim=self.model_config.head_dim,
  layer_num=self.model_config.num_hidden_layers,
  device=self.device,
+ enable_memory_saver=self.server_args.enable_memory_saver,
  )
  logger.info(
  f"Memory pool end. "
@@ -627,7 +671,6 @@ class ModelRunner:
  )

  def init_double_sparsity_channel_config(self, selected_channel):
-
  selected_channel = "." + selected_channel + "_proj"
  self.sorted_channels = []
  # load channel config
@@ -718,7 +761,7 @@ class ModelRunner:
  elif forward_batch.forward_mode.is_idle():
  return self.forward_idle(forward_batch)
  else:
- raise ValueError(f"Invaid forward mode: {forward_batch.forward_mode}")
+ raise ValueError(f"Invalid forward mode: {forward_batch.forward_mode}")

  def sample(
  self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
sglang/srt/models/chatglm.py CHANGED
@@ -23,8 +23,8 @@ from torch import nn
  from torch.nn import LayerNorm
  from vllm.distributed import get_tensor_model_parallel_world_size
  from vllm.model_executor.layers.rotary_embedding import get_rope
- from vllm.transformers_utils.configs import ChatGLMConfig

+ from sglang.srt.configs import ChatGLMConfig
  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.linear import (
sglang/srt/models/dbrx.py CHANGED
@@ -25,8 +25,8 @@ from vllm.distributed import (
  tensor_model_parallel_all_reduce,
  )
  from vllm.model_executor.layers.rotary_embedding import get_rope
- from vllm.transformers_utils.configs.dbrx import DbrxConfig

+ from sglang.srt.configs import DbrxConfig
  from sglang.srt.layers.linear import (
  QKVParallelLinear,
  ReplicatedLinear,
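Both model files above switch from vllm.transformers_utils.configs to the newly vendored sglang.srt.configs package (items 4-6 in the file list), which drops a dependency on an internal vLLM module. The four lines added to sglang/srt/configs/__init__.py are not shown; presumably they just re-export the two new classes, roughly:

# Assumed shape of the additions to sglang/srt/configs/__init__.py.
from sglang.srt.configs.chatglm import ChatGLMConfig
from sglang.srt.configs.dbrx import DbrxConfig

# __all__ is presumably extended with the new names as well.
__all__ = ["ChatGLMConfig", "DbrxConfig"]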
sglang/srt/models/grok.py CHANGED
@@ -57,6 +57,7 @@ class Grok1MLP(nn.Module):
  quant_config: Optional[QuantizationConfig] = None,
  prefix: str = "",
  reduce_results=True,
+ use_presharded_weights: bool = False,
  ) -> None:
  super().__init__()
  self.gate_up_proj = MergedColumnParallelLinear(
@@ -65,6 +66,7 @@ class Grok1MLP(nn.Module):
  bias=False,
  quant_config=quant_config,
  prefix=f"{prefix}.gate_up_proj",
+ use_presharded_weights=use_presharded_weights,
  )
  self.down_proj = RowParallelLinear(
  intermediate_size,
@@ -73,6 +75,7 @@ class Grok1MLP(nn.Module):
  quant_config=quant_config,
  prefix=f"{prefix}.down_proj",
  reduce_results=reduce_results,
+ use_presharded_weights=use_presharded_weights,
  )
  self.act_fn = GeluAndMul(approximate="tanh")

@@ -103,6 +106,7 @@ class Grok1MoE(nn.Module):
  quant_config: Optional[QuantizationConfig] = None,
  tp_size: Optional[int] = None,
  reduce_results=True,
+ use_presharded_weights: bool = False,
  ):
  super().__init__()
  self.hidden_size = hidden_size
@@ -129,6 +133,7 @@ class Grok1MoE(nn.Module):
  renormalize=False,
  quant_config=quant_config,
  tp_size=tp_size,
+ use_presharded_weights=use_presharded_weights,
  )

  def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -156,6 +161,7 @@ class Grok1Attention(nn.Module):
  max_position: int = 4096 * 32,
  rope_theta: float = 10000,
  quant_config: Optional[QuantizationConfig] = None,
+ reduce_results: bool = True,
  ) -> None:
  super().__init__()
  self.config = config
@@ -194,6 +200,7 @@ class Grok1Attention(nn.Module):
  hidden_size,
  bias=False,
  quant_config=quant_config,
+ reduce_results=reduce_results,
  )
  self.rotary_emb = get_rope(
  self.head_dim,
@@ -234,10 +241,12 @@ class Grok1DecoderLayer(nn.Module):
  config: PretrainedConfig,
  layer_id: int = 0,
  quant_config: Optional[QuantizationConfig] = None,
+ use_presharded_weights: bool = False,
  ) -> None:
  super().__init__()
  self.num_experts = config.num_local_experts
  self.hidden_size = config.hidden_size
+ self.layer_id = layer_id

  rope_theta = getattr(config, "rope_theta", 10000)
  self.self_attn = Grok1Attention(
@@ -262,6 +271,7 @@ class Grok1DecoderLayer(nn.Module):
  ),
  quant_config=quant_config,
  reduce_results=True,
+ use_presharded_weights=use_presharded_weights,
  )
  self.pre_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
  self.post_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -299,6 +309,7 @@ class Grok1Model(nn.Module):
  self,
  config: PretrainedConfig,
  quant_config: Optional[QuantizationConfig] = None,
+ use_presharded_weights: bool = False,
  ) -> None:
  super().__init__()
  self.config = config
@@ -311,7 +322,12 @@ class Grok1Model(nn.Module):
  )
  self.layers = nn.ModuleList(
  [
- Grok1DecoderLayer(config, i, quant_config=quant_config)
+ Grok1DecoderLayer(
+ config,
+ i,
+ quant_config=quant_config,
+ use_presharded_weights=use_presharded_weights,
+ )
  for i in range(config.num_hidden_layers)
  ]
  )
@@ -347,11 +363,7 @@ class Grok1ForCausalLM(nn.Module):
  super().__init__()
  self.config = config
  self.quant_config = quant_config
- self.model = Grok1Model(config, quant_config=quant_config)
- self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
- self.logits_processor = LogitsProcessor(config)

- # Monkey patch _prepare_weights to load pre-sharded weights
  if (
  self.config.num_local_experts > 0
  and get_tensor_model_parallel_world_size() > 1
@@ -361,6 +373,14 @@ class Grok1ForCausalLM(nn.Module):
  else:
  self.use_presharded_weights = False

+ self.model = Grok1Model(
+ config,
+ quant_config=quant_config,
+ use_presharded_weights=self.use_presharded_weights,
+ )
+ self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+ self.logits_processor = LogitsProcessor(config)
+
  def forward(
  self,
  input_ids: torch.Tensor,
@@ -376,10 +396,7 @@ class Grok1ForCausalLM(nn.Module):
  def load_weights(
  self,
  weights: Iterable[Tuple[str, torch.Tensor]],
- use_presharded_weights: bool | None = None,
  ):
- if use_presharded_weights is None:
- use_presharded_weights = self.use_presharded_weights
  num_experts = self.config.num_local_experts

  stacked_params_mapping = [
@@ -435,20 +452,12 @@ class Grok1ForCausalLM(nn.Module):
  continue
  name = name.replace(weight_name, param_name)

- if use_presharded_weights:
- extra_kwargs = {
- "use_presharded_weights": use_presharded_weights
- }
- else:
- extra_kwargs = {}
-
  load_weight_wrapper(
  name,
  loaded_weight,
  name,
  shard_id=shard_id,
  expert_id=expert_id,
- **extra_kwargs,
  )
  break
  else:
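use_presharded_weights now flows from Grok1ForCausalLM through Grok1Model into the linear and MoE layers at construction time, instead of being threaded through load_weights per call. The layer-side handling lives in layers/linear.py and layers/moe/fused_moe_triton/layer.py (changed above, not shown); sketched under the assumption that the weight loaders simply skip tensor-parallel narrowing when the checkpoint already stores one shard per rank, as the Grok-1 release does:

# Hypothetical loader sketch; the real logic sits in sglang/srt/layers/linear.py.
import torch


def load_column_parallel_weight(param: torch.nn.Parameter,
                                loaded_weight: torch.Tensor,
                                tp_rank: int,
                                tp_size: int,
                                use_presharded_weights: bool) -> None:
    if use_presharded_weights:
        # The checkpoint already holds only this rank's shard, so copy it verbatim.
        shard = loaded_weight
    else:
        # Regular path: narrow the full tensor to this rank's slice of the
        # output dimension.
        shard_size = loaded_weight.shape[0] // tp_size
        shard = loaded_weight.narrow(0, tp_rank * shard_size, shard_size)
    param.data.copy_(shard)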
sglang/srt/models/llama.py CHANGED
@@ -22,8 +22,12 @@ from typing import Any, Dict, Iterable, Optional, Tuple
  import torch
  from torch import nn
  from transformers import LlamaConfig
- from vllm.distributed import get_tensor_model_parallel_world_size
+ from vllm.distributed import (
+ get_tensor_model_parallel_rank,
+ get_tensor_model_parallel_world_size,
+ )
  from vllm.model_executor.layers.rotary_embedding import get_rope
+ from vllm.model_executor.model_loader.weight_utils import kv_cache_scales_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
@@ -100,6 +104,7 @@ class LlamaAttention(nn.Module):
  max_position_embeddings: int = 8192,
  quant_config: Optional[QuantizationConfig] = None,
  prefix: str = "",
+ bias: bool = False,
  ) -> None:
  super().__init__()
  self.hidden_size = hidden_size
@@ -132,14 +137,14 @@ class LlamaAttention(nn.Module):
  self.head_dim,
  self.total_num_heads,
  self.total_num_kv_heads,
- bias=False,
+ bias=bias,
  quant_config=quant_config,
  prefix=f"{prefix}.qkv_proj",
  )
  self.o_proj = RowParallelLinear(
  self.total_num_heads * self.head_dim,
  hidden_size,
- bias=False,
+ bias=bias,
  quant_config=quant_config,
  prefix=f"{prefix}.o_proj",
  )
@@ -194,6 +199,11 @@ class LlamaDecoderLayer(nn.Module):
  )
  rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
  max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+ # Support llamafy/Qwen-Qwen2.5-7B-Instruct-llamafied with attention_bias
+ # Support internlm/internlm-7b with bias
+ attention_bias = getattr(config, "attention_bias", False) or getattr(
+ config, "bias", False
+ )
  self.self_attn = LlamaAttention(
  config=config,
  hidden_size=self.hidden_size,
@@ -206,6 +216,7 @@ class LlamaDecoderLayer(nn.Module):
  max_position_embeddings=max_position_embeddings,
  quant_config=quant_config,
  prefix=f"{prefix}.self_attn",
+ bias=attention_bias,
  )
  self.mlp = LlamaMLP(
  hidden_size=self.hidden_size,
@@ -292,6 +303,30 @@ class LlamaModel(nn.Module):
  hidden_states, _ = self.norm(hidden_states, residual)
  return hidden_states

+ # If this function is called, it should always initialize KV cache scale
+ # factors (or else raise an exception). Thus, handled exceptions should
+ # make sure to leave KV cache scale factors in a known good (dummy) state
+ def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+ tp_size = get_tensor_model_parallel_world_size()
+ tp_rank = get_tensor_model_parallel_rank()
+ for layer_idx, scaling_factor in kv_cache_scales_loader(
+ quantization_param_path,
+ tp_rank,
+ tp_size,
+ self.config.num_hidden_layers,
+ self.config.__class__.model_type,
+ ):
+ if not isinstance(self.layers[layer_idx], nn.Identity):
+ layer_self_attn = self.layers[layer_idx].self_attn
+
+ if hasattr(layer_self_attn.attn, "k_scale"):
+ layer_self_attn.attn.k_scale = scaling_factor
+ layer_self_attn.attn.v_scale = scaling_factor
+ else:
+ raise RuntimeError(
+ "Self attention has no KV cache scaling " "factor attribute!"
+ )
+

  class LlamaForCausalLM(nn.Module):

@@ -527,9 +562,16 @@ class LlamaForCausalLM(nn.Module):
  torch.cuda.empty_cache()
  torch.cuda.synchronize()

+ def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+ self.model.load_kv_cache_scales(quantization_param_path)
+

  class Phi3ForCausalLM(LlamaForCausalLM):
  pass


- EntryClass = [LlamaForCausalLM, Phi3ForCausalLM]
+ class InternLM3ForCausalLM(LlamaForCausalLM):
+ pass
+
+
+ EntryClass = [LlamaForCausalLM, Phi3ForCausalLM, InternLM3ForCausalLM]
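Taken together with the model_runner.py hunk above, the FP8 KV cache path is duck-typed: ModelRunner only calls load_kv_cache_scales when the loaded model defines it, and the Llama family (now including InternLM3 and Phi-3 via subclassing) delegates to LlamaModel, which assigns per-layer k_scale/v_scale from vLLM's kv_cache_scales_loader. An illustrative end-to-end sketch driven through ServerArgs; the model path and JSON file are placeholders, and the scales-file schema is whatever kv_cache_scales_loader expects:

# Illustrative wiring only; field names are taken from the ServerArgs fields
# referenced in this diff (kv_cache_dtype, quantization_param_path).
from sglang.srt.server_args import ServerArgs

server_args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",   # placeholder model
    kv_cache_dtype="fp8_e4m3",                        # resolves to torch.float8_e4m3fn on CUDA
    quantization_param_path="kv_cache_scales.json",   # per-layer scaling factors
)
# With no quantization_param_path, ModelRunner logs a warning and falls back
# to scaling factors of 1.0, which may reduce accuracy.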
sglang/srt/models/qwen2.py CHANGED
@@ -362,5 +362,16 @@ class Qwen2ForCausalLM(nn.Module):
  weight_loader = getattr(param, "weight_loader", default_weight_loader)
  weight_loader(param, loaded_weight)

+ def get_embed_and_head(self):
+ return self.model.embed_tokens.weight, self.lm_head.weight
+
+ def set_embed_and_head(self, embed, head):
+ del self.model.embed_tokens.weight
+ del self.lm_head.weight
+ self.model.embed_tokens.weight = embed
+ self.lm_head.weight = head
+ torch.cuda.empty_cache()
+ torch.cuda.synchronize()
+

  EntryClass = Qwen2ForCausalLM
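get_embed_and_head / set_embed_and_head let the EAGLE draft model reuse the target model's embedding and LM head tensors instead of holding duplicates. The caller sits in speculative/eagle_worker.py (changed above, not shown), so the exact hand-off is an assumption; roughly:

# Hypothetical hand-off between target and draft model; the real call site is
# in sglang/srt/speculative/eagle_worker.py, which this excerpt omits.
def tie_draft_to_target(target_model, draft_model):
    # Borrow the target model's embedding and lm_head parameters...
    embed, head = target_model.get_embed_and_head()
    # ...and install them on the draft model; set_embed_and_head deletes the
    # draft's own copies first and then empties the CUDA cache.
    draft_model.set_embed_and_head(embed, head)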
sglang/srt/models/qwen2_eagle.py ADDED
@@ -0,0 +1,131 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ # Adapted from
+ # https://github.com/SafeAILab/EAGLE/blob/main/eagle/model/cnets.py
+ """Inference-only LLaMA-EAGLE model compatible with HuggingFace weights."""
+
+ from typing import Iterable, Optional, Tuple
+
+ import torch
+ from torch import nn
+
+ from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
+ from sglang.srt.layers.vocab_parallel_embedding import (
+ ParallelLMHead,
+ VocabParallelEmbedding,
+ )
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+ from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2ForCausalLM
+
+ Qwen2Config = None
+
+
+ class Qwen2DecoderLayer(Qwen2DecoderLayer):
+ def __init__(
+ self,
+ config: Qwen2Config,
+ layer_id: int = 0,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ) -> None:
+ super().__init__(config, layer_id, quant_config)
+
+ # Skip the input_layernorm
+ # https://github.com/SafeAILab/EAGLE/blob/35c78f6cdc19a73e05cf5c330b4c358dad970c6a/eagle/model/cnets.py#L427
+ if layer_id == 0:
+ del self.input_layernorm
+ setattr(self, "input_layernorm", lambda x: x)
+
+
+ class Qwen2Model(nn.Module):
+ def __init__(
+ self,
+ config: Qwen2Config,
+ quant_config: Optional[QuantizationConfig] = None,
+ ) -> None:
+ super().__init__()
+ self.config = config
+ self.vocab_size = config.vocab_size
+ self.embed_tokens = VocabParallelEmbedding(
+ config.vocab_size,
+ config.hidden_size,
+ )
+ self.layers = nn.ModuleList(
+ [
+ Qwen2DecoderLayer(
+ config, i, quant_config=quant_config, prefix=f"model.layers.{i}"
+ )
+ for i in range(config.num_hidden_layers)
+ ]
+ )
+ self.fc = torch.nn.Linear(config.hidden_size * 2, config.hidden_size)
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ forward_batch: ForwardBatch,
+ input_embeds: torch.Tensor = None,
+ ) -> torch.Tensor:
+ if input_embeds is None:
+ hidden_states = self.embed_tokens(input_ids)
+ else:
+ hidden_states = input_embeds
+
+ hidden_states = self.fc(
+ torch.cat((hidden_states, forward_batch.spec_info.hidden_states), dim=-1)
+ )
+
+ residual = None
+ for i in range(len(self.layers)):
+ layer = self.layers[i]
+ hidden_states, residual = layer(
+ positions,
+ hidden_states,
+ forward_batch,
+ residual,
+ )
+ return hidden_states + residual
+
+
+ class Qwen2ForCausalLMEagle(Qwen2ForCausalLM):
+ def __init__(
+ self,
+ config: Qwen2Config,
+ quant_config: Optional[QuantizationConfig] = None,
+ cache_config=None,
+ ) -> None:
+ nn.Module.__init__(self)
+ self.config = config
+ self.quant_config = quant_config
+ self.model = Qwen2Model(config, quant_config=quant_config)
+ if self.config.tie_word_embeddings:
+ self.lm_head = self.model.embed_tokens
+ else:
+ self.lm_head = ParallelLMHead(
+ config.vocab_size, config.hidden_size, quant_config=quant_config
+ )
+ self.logits_processor = LogitsProcessor(config)
+
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+ for name, loaded_weight in weights:
+ if "lm_head" not in name:
+ name = "model." + name
+ super().load_weights([(name, loaded_weight)])
+
+
+ EntryClass = [Qwen2ForCausalLMEagle]
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py CHANGED
@@ -3,6 +3,11 @@ from typing import List
  import torch

  from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs
+ from sglang.srt.utils import is_cuda_available
+
+ is_cuda = is_cuda_available()
+ if is_cuda:
+ from sgl_kernel import sampling_scaling_penalties


  class BatchedRepetitionPenalizer(_BatchedPenalizer):
@@ -56,11 +61,16 @@ class BatchedRepetitionPenalizer(_BatchedPenalizer):
  self.cumulated_repetition_penalties[mask] = self.repetition_penalties[mask]

  def _apply(self, logits: torch.Tensor) -> torch.Tensor:
- return torch.where(
- logits > 0,
- logits / self.cumulated_repetition_penalties,
- logits * self.cumulated_repetition_penalties,
- )
+ if is_cuda:
+ return sampling_scaling_penalties(
+ logits, self.cumulated_repetition_penalties
+ )
+ else:
+ return torch.where(
+ logits > 0,
+ logits / self.cumulated_repetition_penalties,
+ logits * self.cumulated_repetition_penalties,
+ )

  def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
  self.repetition_penalties = self.repetition_penalties[indices_tensor_to_keep]
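This hunk and the sampling_batch_info.py hunk below take the same shape: prefer the fused sgl_kernel sampling_scaling_penalties kernel on CUDA builds and keep the torch.where expression as the portable fallback. The two paths are intended to be numerically equivalent; a quick reference check, assuming sgl_kernel is installed and the kernel accepts the same shapes and dtypes as the fallback:

# Sanity check that the fused kernel matches the torch.where fallback.
import torch
from sgl_kernel import sampling_scaling_penalties

logits = torch.randn(8, 32000, device="cuda")
penalties = torch.ones_like(logits)
penalties[:, ::7] = 1.3  # pretend every 7th vocab id was generated already

reference = torch.where(logits > 0, logits / penalties, logits * penalties)
fused = sampling_scaling_penalties(logits, penalties)

assert torch.allclose(fused, reference, atol=1e-5, rtol=1e-5)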
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -7,6 +7,12 @@ from typing import TYPE_CHECKING, Callable, List, Optional

  import torch

+ from sglang.srt.utils import is_cuda_available
+
+ is_cuda = is_cuda_available()
+ if is_cuda:
+ from sgl_kernel import sampling_scaling_penalties
+
  import sglang.srt.sampling.penaltylib as penaltylib

  logger = logging.getLogger(__name__)
@@ -232,6 +238,7 @@ class SamplingBatchInfo:
  self.logit_bias = SamplingBatchInfo.merge_bias_tensor(
  self.logit_bias, other.logit_bias, len(self), len(other), self.device
  )
+ self.need_min_p_sampling = self.need_min_p_sampling or other.need_min_p_sampling

  def apply_logits_bias(self, logits: torch.Tensor):
  # Apply logit_bias
@@ -244,11 +251,14 @@ class SamplingBatchInfo:

  # repetition
  if self.scaling_penalties is not None:
- logits[:] = torch.where(
- logits > 0,
- logits / self.scaling_penalties,
- logits * self.scaling_penalties,
- )
+ if is_cuda:
+ logits[:] = sampling_scaling_penalties(logits, self.scaling_penalties)
+ else:
+ logits[:] = torch.where(
+ logits > 0,
+ logits / self.scaling_penalties,
+ logits * self.scaling_penalties,
+ )

  # Apply regex vocab_mask
  if self.vocab_mask is not None:
sglang/srt/sampling/sampling_params.py CHANGED
@@ -23,7 +23,7 @@ class SamplingParams:
  The sampling parameters.

  See docs/references/sampling_params.md or
- https://sgl-project.github.io/references/sampling_params.html
+ https://docs.sglang.ai/references/sampling_params.html
  for the documentation.
  """