sglang 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. sglang/__init__.py +2 -2
  2. sglang/api.py +30 -4
  3. sglang/backend/litellm.py +2 -2
  4. sglang/backend/openai.py +26 -15
  5. sglang/backend/runtime_endpoint.py +18 -14
  6. sglang/bench_latency.py +317 -0
  7. sglang/global_config.py +5 -1
  8. sglang/lang/chat_template.py +41 -6
  9. sglang/lang/compiler.py +2 -2
  10. sglang/lang/interpreter.py +6 -2
  11. sglang/lang/ir.py +74 -28
  12. sglang/launch_server.py +4 -1
  13. sglang/launch_server_llavavid.py +2 -1
  14. sglang/srt/constrained/__init__.py +14 -6
  15. sglang/srt/constrained/fsm_cache.py +6 -3
  16. sglang/srt/constrained/jump_forward.py +113 -25
  17. sglang/srt/conversation.py +2 -0
  18. sglang/srt/flush_cache.py +2 -0
  19. sglang/srt/hf_transformers_utils.py +68 -9
  20. sglang/srt/layers/extend_attention.py +2 -1
  21. sglang/srt/layers/fused_moe.py +280 -169
  22. sglang/srt/layers/logits_processor.py +106 -42
  23. sglang/srt/layers/radix_attention.py +53 -29
  24. sglang/srt/layers/token_attention.py +4 -1
  25. sglang/srt/managers/controller/dp_worker.py +6 -3
  26. sglang/srt/managers/controller/infer_batch.py +144 -69
  27. sglang/srt/managers/controller/manager_multi.py +5 -5
  28. sglang/srt/managers/controller/manager_single.py +9 -4
  29. sglang/srt/managers/controller/model_runner.py +167 -55
  30. sglang/srt/managers/controller/radix_cache.py +4 -0
  31. sglang/srt/managers/controller/schedule_heuristic.py +2 -0
  32. sglang/srt/managers/controller/tp_worker.py +156 -134
  33. sglang/srt/managers/detokenizer_manager.py +19 -21
  34. sglang/srt/managers/io_struct.py +11 -5
  35. sglang/srt/managers/tokenizer_manager.py +16 -14
  36. sglang/srt/model_config.py +89 -4
  37. sglang/srt/models/chatglm.py +399 -0
  38. sglang/srt/models/commandr.py +2 -2
  39. sglang/srt/models/dbrx.py +1 -1
  40. sglang/srt/models/gemma.py +5 -1
  41. sglang/srt/models/gemma2.py +436 -0
  42. sglang/srt/models/grok.py +204 -137
  43. sglang/srt/models/llama2.py +12 -5
  44. sglang/srt/models/llama_classification.py +107 -0
  45. sglang/srt/models/llava.py +11 -8
  46. sglang/srt/models/llavavid.py +1 -1
  47. sglang/srt/models/minicpm.py +373 -0
  48. sglang/srt/models/mixtral.py +164 -115
  49. sglang/srt/models/mixtral_quant.py +0 -1
  50. sglang/srt/models/qwen.py +1 -1
  51. sglang/srt/models/qwen2.py +1 -1
  52. sglang/srt/models/qwen2_moe.py +454 -0
  53. sglang/srt/models/stablelm.py +1 -1
  54. sglang/srt/models/yivl.py +2 -2
  55. sglang/srt/openai_api_adapter.py +35 -25
  56. sglang/srt/openai_protocol.py +2 -2
  57. sglang/srt/server.py +69 -19
  58. sglang/srt/server_args.py +76 -43
  59. sglang/srt/utils.py +177 -35
  60. sglang/test/test_programs.py +28 -10
  61. sglang/utils.py +4 -3
  62. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/METADATA +44 -31
  63. sglang-0.1.19.dist-info/RECORD +81 -0
  64. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/WHEEL +1 -1
  65. sglang/srt/managers/router/infer_batch.py +0 -596
  66. sglang/srt/managers/router/manager.py +0 -82
  67. sglang/srt/managers/router/model_rpc.py +0 -818
  68. sglang/srt/managers/router/model_runner.py +0 -445
  69. sglang/srt/managers/router/radix_cache.py +0 -267
  70. sglang/srt/managers/router/scheduler.py +0 -59
  71. sglang-0.1.17.dist-info/RECORD +0 -81
  72. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/LICENSE +0 -0
  73. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/top_level.txt +0 -0
sglang/srt/models/grok.py CHANGED
@@ -1,7 +1,7 @@
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/mixtral.py#L1
 """Inference-only Grok1 model."""
-from typing import Iterable, Optional, Tuple, List
+from typing import Iterable, List, Optional, Tuple
 
 import numpy as np
 import torch
@@ -9,7 +9,6 @@ import torch.nn.functional as F
 import tqdm
 from torch import nn
 from transformers import PretrainedConfig
-
 from vllm import _custom_ops as ops
 from vllm.config import CacheConfig
 from vllm.distributed import (
@@ -35,12 +34,11 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils import print_warning_once
 
-from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.fused_moe import fused_moe
+from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.managers.controller.model_runner import InputMetadata
 
-
 use_fused = True
 
 
@@ -134,9 +132,12 @@ class Grok1MoEUnfused(nn.Module):
 
         final_hidden_states = torch.zeros(
             (hidden_states.shape[0], hidden_dim),
-            dtype=hidden_states.dtype, device=hidden_states.device
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
         )
-        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_total_experts).permute(2, 1, 0)
+        expert_mask = torch.nn.functional.one_hot(
+            selected_experts, num_classes=self.num_total_experts
+        ).permute(2, 1, 0)
 
         for expert_idx in self.expert_indicies:
             expert_layer = self.experts[expert_idx]
@@ -153,7 +154,10 @@ class Grok1MoEUnfused(nn.Module):
             # the current expert. We need to make sure to multiply the output hidden
             # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
             current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim)
-            current_hidden_states = expert_layer(current_state) * routing_weights[top_x_list, idx_list, None]
+            current_hidden_states = (
+                expert_layer(current_state)
+                * routing_weights[top_x_list, idx_list, None]
+            )
 
             # However `index_add_` only support torch tensors for indexing so we'll use
             # the `top_x` tensor here.
@@ -198,32 +202,46 @@ class Grok1MoE(nn.Module):
         self.params_dtype = params_dtype
 
         # Gate always runs at half / full precision for now.
-        self.gate = ReplicatedLinear(self.hidden_size,
-                                     self.num_total_experts,
-                                     bias=False,
-                                     params_dtype=self.params_dtype,
-                                     quant_config=None)
+        self.gate = ReplicatedLinear(
+            self.hidden_size,
+            self.num_total_experts,
+            bias=False,
+            params_dtype=self.params_dtype,
+            quant_config=None,
+        )
 
         if self.use_fp8 and self.quant_config.is_checkpoint_fp8_serialized:
             params_dtype = torch.float8_e4m3fn
 
         self.w13_weight = nn.Parameter(
-            torch.empty(self.num_total_experts,
-                        2 * self.intermediate_size,
-                        self.hidden_size,
-                        dtype=params_dtype))
+            torch.empty(
+                self.num_total_experts,
+                2 * self.intermediate_size,
+                self.hidden_size,
+                dtype=params_dtype,
+            )
+        )
         self.w2_weight = nn.Parameter(
-            torch.empty(self.num_total_experts,
-                        self.hidden_size,
-                        self.intermediate_size,
-                        dtype=params_dtype))
-
-        set_weight_attrs(self.w13_weight, {
-            "weight_loader": self.weight_loader,
-        })
-        set_weight_attrs(self.w2_weight, {
-            "weight_loader": self.weight_loader,
-        })
+            torch.empty(
+                self.num_total_experts,
+                self.hidden_size,
+                self.intermediate_size,
+                dtype=params_dtype,
+            )
+        )
+
+        set_weight_attrs(
+            self.w13_weight,
+            {
+                "weight_loader": self.weight_loader,
+            },
+        )
+        set_weight_attrs(
+            self.w2_weight,
+            {
+                "weight_loader": self.weight_loader,
+            },
+        )
 
         # Used for fp8.
         self.w13_scale = None
@@ -233,46 +251,69 @@ class Grok1MoE(nn.Module):
 
         if self.use_fp8:
             # WEIGHT_SCALE (for fp8)
-            self.w13_scale = nn.Parameter(torch.ones(self.num_total_experts,
-                                                     dtype=torch.float32),
-                                          requires_grad=False)
-            self.w2_scale = nn.Parameter(torch.ones(self.num_total_experts,
-                                                    dtype=torch.float32),
-                                         requires_grad=False)
+            self.w13_scale = nn.Parameter(
+                torch.ones(self.num_total_experts, dtype=torch.float32),
+                requires_grad=False,
+            )
+            self.w2_scale = nn.Parameter(
+                torch.ones(self.num_total_experts, dtype=torch.float32),
+                requires_grad=False,
+            )
 
             # If loading fp8 checkpoint, pass the weight loaders.
             # If loading an fp16 checkpoint, do not (we will quantize in
             # process_weights_after_loading()
             if quant_config.is_checkpoint_fp8_serialized:
-                set_weight_attrs(self.w13_scale, {
-                    "weight_loader": self.weight_loader,
-                })
-                set_weight_attrs(self.w2_scale, {
-                    "weight_loader": self.weight_loader,
-                })
+                set_weight_attrs(
+                    self.w13_scale,
+                    {
+                        "weight_loader": self.weight_loader,
+                    },
+                )
+                set_weight_attrs(
+                    self.w2_scale,
+                    {
+                        "weight_loader": self.weight_loader,
+                    },
+                )
 
             # ACT_SCALE (for fp8)
             if quant_config.activation_scheme == "static":
                 if not quant_config.is_checkpoint_fp8_serialized:
                     raise ValueError(
                         "Found static activation scheme for checkpoint that "
-                        "was not serialized fp8.")
-                self.a13_scale = nn.Parameter(torch.zeros(
-                    self.num_total_experts, dtype=torch.float32),
-                                              requires_grad=False)
-                self.a2_scale = nn.Parameter(torch.zeros(
-                    self.num_total_experts, dtype=torch.float32),
-                                             requires_grad=False)
-
-                set_weight_attrs(self.a13_scale, {
-                    "weight_loader": self.weight_loader,
-                })
-                set_weight_attrs(self.a2_scale, {
-                    "weight_loader": self.weight_loader,
-                })
-
-    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
-                      weight_name: str, expert_id: int, pre_sharded: bool):
+                        "was not serialized fp8."
+                    )
+                self.a13_scale = nn.Parameter(
+                    torch.zeros(self.num_total_experts, dtype=torch.float32),
+                    requires_grad=False,
+                )
+                self.a2_scale = nn.Parameter(
+                    torch.zeros(self.num_total_experts, dtype=torch.float32),
+                    requires_grad=False,
+                )
+
+                set_weight_attrs(
+                    self.a13_scale,
+                    {
+                        "weight_loader": self.weight_loader,
+                    },
+                )
+                set_weight_attrs(
+                    self.a2_scale,
+                    {
+                        "weight_loader": self.weight_loader,
+                    },
+                )
+
+    def weight_loader(
+        self,
+        param: nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        expert_id: int,
+        pre_sharded: bool,
+    ):
         param_data = param.data
         shard_size = self.intermediate_size
         if pre_sharded:
@@ -284,8 +325,9 @@ class Grok1MoE(nn.Module):
         if weight_name.endswith("w1.weight"):
             param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :]
         if weight_name.endswith("w3.weight"):
-            param_data[expert_id,
-                       shard_size:2 * shard_size, :] = loaded_weight[shard, :]
+            param_data[expert_id, shard_size : 2 * shard_size, :] = loaded_weight[
+                shard, :
+            ]
         if weight_name.endswith("w2.weight"):
             param_data[expert_id, :, :] = loaded_weight[:, shard]
         if "act_scale" in weight_name or "weight_scale" in weight_name:
@@ -298,17 +340,17 @@ class Grok1MoE(nn.Module):
 
         # If checkpoint is fp16, quantize here.
         if not self.quant_config.is_checkpoint_fp8_serialized:
-            w13_weight = torch.empty_like(self.w13_weight.data,
-                                          dtype=torch.float8_e4m3fn)
-            w2_weight = torch.empty_like(self.w2_weight.data,
-                                         dtype=torch.float8_e4m3fn)
+            w13_weight = torch.empty_like(
+                self.w13_weight.data, dtype=torch.float8_e4m3fn
+            )
+            w2_weight = torch.empty_like(self.w2_weight.data, dtype=torch.float8_e4m3fn)
             for expert in range(self.num_total_experts):
-                w13_weight[expert, :, :], self.w13_scale[
-                    expert] = ops.scaled_fp8_quant(
-                        self.w13_weight.data[expert, :, :])
-                w2_weight[expert, :, :], self.w2_scale[
-                    expert] = ops.scaled_fp8_quant(
-                        self.w2_weight.data[expert, :, :])
+                w13_weight[expert, :, :], self.w13_scale[expert] = ops.scaled_fp8_quant(
+                    self.w13_weight.data[expert, :, :]
+                )
+                w2_weight[expert, :, :], self.w2_scale[expert] = ops.scaled_fp8_quant(
+                    self.w2_weight.data[expert, :, :]
+                )
             self.w13_weight = nn.Parameter(w13_weight, requires_grad=False)
             self.w2_weight = nn.Parameter(w2_weight, requires_grad=False)
 
@@ -319,40 +361,40 @@ class Grok1MoE(nn.Module):
             if self.a13_scale is None or self.a2_scale is None:
                 raise ValueError(
                     "QuantConfig has static quantization, but found "
-                    "activation scales are None.")
+                    "activation scales are None."
+                )
 
-            if (not all_close_1d(self.a13_scale)
-                    or not all_close_1d(self.a2_scale)):
+            if not all_close_1d(self.a13_scale) or not all_close_1d(self.a2_scale):
                 print_warning_once(
                     "Found act_scales that are not equal for fp8 MoE layer. "
-                    "Using the maximum across experts for each layer. ")
+                    "Using the maximum across experts for each layer. "
+                )
 
-            self.a13_scale = nn.Parameter(self.a13_scale.max(),
-                                          requires_grad=False)
-            self.a2_scale = nn.Parameter(self.a2_scale.max(),
-                                         requires_grad=False)
+            self.a13_scale = nn.Parameter(self.a13_scale.max(), requires_grad=False)
+            self.a2_scale = nn.Parameter(self.a2_scale.max(), requires_grad=False)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_size = hidden_states.shape
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = fused_moe(hidden_states,
-                                        self.w13_weight,
-                                        self.w2_weight,
-                                        router_logits,
-                                        self.top_k,
-                                        renormalize=False,
-                                        inplace=True,
-                                        use_fp8=self.use_fp8,
-                                        w1_scale=self.w13_scale,
-                                        w2_scale=self.w2_scale,
-                                        a1_scale=self.a13_scale,
-                                        a2_scale=self.a2_scale)
+        final_hidden_states = fused_moe(
+            hidden_states,
+            self.w13_weight,
+            self.w2_weight,
+            router_logits,
+            self.top_k,
+            renormalize=False,
+            inplace=True,
+            use_fp8=self.use_fp8,
+            w1_scale=self.w13_scale,
+            w2_scale=self.w2_scale,
+            a1_scale=self.a13_scale,
+            a2_scale=self.a2_scale,
+        )
 
         if self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
 
         return final_hidden_states.view(num_tokens, hidden_size)
 
@@ -462,10 +504,12 @@ class Grok1DecoderLayer(nn.Module):
                 top_k=config.num_experts_per_tok,
                 hidden_size=config.hidden_size,
                 intermediate_size=config.intermediate_size,
-                quant_config=quant_config)
+                quant_config=quant_config,
+            )
         else:
             self.block_sparse_moe = Grok1MoEUnfused(
-                config=config, quant_config=quant_config)
+                config=config, quant_config=quant_config
+            )
         self.pre_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.pre_moe_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -477,13 +521,21 @@ class Grok1DecoderLayer(nn.Module):
         hidden_states: torch.Tensor,
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
+        hidden_states = (
+            self.post_attn_norm(
+                self.self_attn(
+                    positions=positions,
+                    hidden_states=self.pre_attn_norm(hidden_states),
+                    input_metadata=input_metadata,
+                )
+            )
+            + hidden_states
+        )
 
-        hidden_states = self.post_attn_norm(self.self_attn(
-            positions=positions, hidden_states=self.pre_attn_norm(hidden_states),
-            input_metadata=input_metadata,
-        )) + hidden_states
-
-        hidden_states = self.post_moe_norm(self.block_sparse_moe(self.pre_moe_norm(hidden_states))) + hidden_states
+        hidden_states = (
+            self.post_moe_norm(self.block_sparse_moe(self.pre_moe_norm(hidden_states)))
+            + hidden_states
+        )
 
         return hidden_states
 
@@ -525,9 +577,7 @@ class Grok1Model(nn.Module):
         hidden_states.mul_(self.config.embedding_multiplier_scale)
 
         for i in range(len(self.layers)):
-            hidden_states = self.layers[i](
-                positions, hidden_states, input_metadata
-            )
+            hidden_states = self.layers[i](positions, hidden_states, input_metadata)
 
         hidden_states = self.norm(hidden_states)
         hidden_states.mul_(self.config.output_multiplier_scale)
@@ -572,28 +622,41 @@ class Grok1ModelForCausalLM(nn.Module):
         ]
 
         if use_fused:
-            expert_params_mapping = [
-                # These are the weight scales for the experts
-                # (param_name, weight_name, expert_id)
-                ("w13_scale" if weight_name in ["w1", "w3"] else "w2_scale",
-                 f"experts.{expert_id}.{weight_name}.weight_scale", expert_id)
-                for expert_id in range(self.config.num_local_experts)
-                for weight_name in ["w1", "w2", "w3"]
-            ] + [
-                # These are the weights for the experts
-                # (param_name, weight_name, expert_id)
-                ("w13_weight" if weight_name in ["w1", "w3"] else "w2_weight",
-                 f"experts.{expert_id}.{weight_name}.weight", expert_id)
-                for expert_id in range(self.config.num_local_experts)
-                for weight_name in ["w1", "w2", "w3"]
-            ] + [
-                # These are the activation scales for the experts
-                # (param_name, weight_name, expert_id)
-                ("a13_scale" if weight_name in ["w1", "w3"] else "a2_scale",
-                 f"experts.{expert_id}.{weight_name}.act_scale", expert_id)
-                for expert_id in range(self.config.num_local_experts)
-                for weight_name in ["w1", "w2", "w3"]
-            ]
+            expert_params_mapping = (
+                [
+                    # These are the weight scales for the experts
+                    # (param_name, weight_name, expert_id)
+                    (
+                        "w13_scale" if weight_name in ["w1", "w3"] else "w2_scale",
+                        f"experts.{expert_id}.{weight_name}.weight_scale",
+                        expert_id,
+                    )
+                    for expert_id in range(self.config.num_local_experts)
+                    for weight_name in ["w1", "w2", "w3"]
+                ]
+                + [
+                    # These are the weights for the experts
+                    # (param_name, weight_name, expert_id)
+                    (
+                        "w13_weight" if weight_name in ["w1", "w3"] else "w2_weight",
+                        f"experts.{expert_id}.{weight_name}.weight",
+                        expert_id,
+                    )
+                    for expert_id in range(self.config.num_local_experts)
+                    for weight_name in ["w1", "w2", "w3"]
+                ]
+                + [
+                    # These are the activation scales for the experts
+                    # (param_name, weight_name, expert_id)
+                    (
+                        "a13_scale" if weight_name in ["w1", "w3"] else "a2_scale",
+                        f"experts.{expert_id}.{weight_name}.act_scale",
+                        expert_id,
+                    )
+                    for expert_id in range(self.config.num_local_experts)
+                    for weight_name in ["w1", "w2", "w3"]
+                ]
+            )
         else:
             expert_params_mapping = []
 
@@ -601,11 +664,11 @@ class Grok1ModelForCausalLM(nn.Module):
         if get_tensor_model_parallel_rank() == 0:
             weights = tqdm.tqdm(weights, total=int(len(params_dict) * 3.4))
         for name, loaded_weight in weights:
-            #print(get_tensor_model_parallel_rank(), name)
+            # print(get_tensor_model_parallel_rank(), name)
             if "rotary_emb.inv_freq" in name:
                 continue
 
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -623,19 +686,22 @@ class Grok1ModelForCausalLM(nn.Module):
                     name = name.replace(weight_name, param_name)
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  weight_name,
-                                  expert_id=expert_id,
-                                  pre_sharded=get_tensor_model_parallel_world_size() > 1)
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        weight_name,
+                        expert_id=expert_id,
+                        pre_sharded=get_tensor_model_parallel_world_size() > 1,
+                    )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
                     if name.endswith(".bias") and name not in params_dict:
                         continue
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
 
 
@@ -645,10 +711,11 @@ def all_close_1d(x: torch.Tensor) -> bool:
 
 
 old_prepare_weights = getattr(DefaultModelLoader, "_prepare_weights")
-def _prepare_presharded_weights(self,
-                                model_name_or_path: str,
-                                revision: Optional[str],
-                                fall_back_to_pt: bool) -> Tuple[str, List[str], bool]:
+
+
+def _prepare_presharded_weights(
+    self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool
+) -> Tuple[str, List[str], bool]:
     import glob
     import os
 
@@ -668,4 +735,4 @@ def _prepare_presharded_weights(self,
     return hf_folder, hf_weights_files, use_safetensors
 
 
-EntryClass = Grok1ModelForCausalLM
+EntryClass = Grok1ModelForCausalLM
sglang/srt/models/llama2.py CHANGED
@@ -1,7 +1,8 @@
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
 """Inference-only LLaMA model compatible with HuggingFace weights."""
-from typing import Any, Dict, Optional, Tuple, Iterable
+
+from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
 import tqdm
@@ -10,7 +11,7 @@ from transformers import LlamaConfig
 from vllm.config import CacheConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size
+    get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -76,6 +77,7 @@ class LlamaAttention(nn.Module):
         layer_id: int = 0,
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
         max_position_embeddings: int = 8192,
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
@@ -123,6 +125,7 @@ class LlamaAttention(nn.Module):
             max_position=max_position_embeddings,
             base=rope_theta,
             rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
         )
         self.attn = RadixAttention(
             self.num_heads,
@@ -158,9 +161,12 @@ class LlamaDecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         if rope_scaling is not None and getattr(
-                config, "original_max_position_embeddings", None):
-            rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings)
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling[
+                "original_max_position_embeddings"
+            ] = config.original_max_position_embeddings
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = LlamaAttention(
             hidden_size=self.hidden_size,
@@ -169,6 +175,7 @@ class LlamaDecoderLayer(nn.Module):
             layer_id=layer_id,
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
         )
sglang/srt/models/llama_classification.py ADDED
@@ -0,0 +1,107 @@
+from typing import Iterable, Optional, Tuple
+
+import torch
+import tqdm
+from torch import nn
+from transformers import LlamaConfig
+from vllm.config import CacheConfig
+from vllm.distributed import get_tensor_model_parallel_rank
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+from sglang.srt.layers.logits_processor import LogitProcessorOutput
+from sglang.srt.managers.controller.model_runner import InputMetadata
+from sglang.srt.models.llama2 import LlamaModel
+
+
+class LlamaForClassification(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = LlamaModel(config, quant_config=quant_config)
+
+        self.classification_head = nn.Linear(
+            config.hidden_size, config.classification_out_size
+        )
+        self.eos_token_id = config.eos_token_id
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        input_metadata: InputMetadata,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
+        is_eos_token = input_ids == self.eos_token_id
+        hidden_states = hidden_states[is_eos_token]
+        scores = self.classification_head(hidden_states)
+
+        if scores.shape[0] != input_metadata.batch_size:
+            print("Warning: the EOS tokens are missing in some sentences.")
+            scores = torch.ones(
+                (input_metadata.batch_size, self.config.classification_out_size)
+            ).to(input_ids.device)
+
+        return LogitProcessorOutput(
+            next_token_logits=scores,
+            next_token_logprobs=scores,
+            normalized_prompt_logprobs=scores,
+            prefill_token_logprobs=torch.ones_like(input_ids),
+            prefill_top_logprobs=None,
+            decode_top_logprobs=None,
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        if get_tensor_model_parallel_rank() == 0:
+            weights = tqdm.tqdm(weights, total=int(len(params_dict) * 1.5))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if "lm_head" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name.startswith("model.vision_tower") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name.startswith("model.vision_tower") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = LlamaForClassification