sglang-0.2.10-py3-none-any.whl → sglang-0.2.12-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (89)
  1. sglang/__init__.py +8 -0
  2. sglang/api.py +10 -2
  3. sglang/bench_latency.py +151 -40
  4. sglang/bench_serving.py +46 -22
  5. sglang/check_env.py +24 -2
  6. sglang/global_config.py +0 -1
  7. sglang/lang/backend/base_backend.py +3 -1
  8. sglang/lang/backend/openai.py +8 -3
  9. sglang/lang/backend/runtime_endpoint.py +46 -29
  10. sglang/lang/choices.py +164 -0
  11. sglang/lang/compiler.py +2 -2
  12. sglang/lang/interpreter.py +6 -13
  13. sglang/lang/ir.py +14 -5
  14. sglang/srt/constrained/base_tool_cache.py +1 -1
  15. sglang/srt/constrained/fsm_cache.py +12 -2
  16. sglang/srt/layers/activation.py +33 -0
  17. sglang/srt/layers/{token_attention.py → decode_attention.py} +9 -5
  18. sglang/srt/layers/extend_attention.py +6 -1
  19. sglang/srt/layers/layernorm.py +65 -0
  20. sglang/srt/layers/logits_processor.py +6 -1
  21. sglang/srt/layers/pooler.py +50 -0
  22. sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} +5 -0
  23. sglang/srt/layers/radix_attention.py +4 -7
  24. sglang/srt/managers/detokenizer_manager.py +31 -9
  25. sglang/srt/managers/io_struct.py +63 -0
  26. sglang/srt/managers/policy_scheduler.py +173 -25
  27. sglang/srt/managers/schedule_batch.py +174 -380
  28. sglang/srt/managers/tokenizer_manager.py +197 -112
  29. sglang/srt/managers/tp_worker.py +299 -364
  30. sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py} +9 -4
  31. sglang/srt/mem_cache/chunk_cache.py +43 -20
  32. sglang/srt/mem_cache/memory_pool.py +10 -15
  33. sglang/srt/mem_cache/radix_cache.py +74 -40
  34. sglang/srt/model_executor/cuda_graph_runner.py +27 -12
  35. sglang/srt/model_executor/forward_batch_info.py +319 -0
  36. sglang/srt/model_executor/model_runner.py +30 -47
  37. sglang/srt/models/chatglm.py +1 -1
  38. sglang/srt/models/commandr.py +1 -1
  39. sglang/srt/models/dbrx.py +1 -1
  40. sglang/srt/models/deepseek.py +1 -1
  41. sglang/srt/models/deepseek_v2.py +1 -1
  42. sglang/srt/models/gemma.py +1 -1
  43. sglang/srt/models/gemma2.py +1 -2
  44. sglang/srt/models/gpt_bigcode.py +1 -1
  45. sglang/srt/models/grok.py +1 -1
  46. sglang/srt/models/internlm2.py +3 -8
  47. sglang/srt/models/llama2.py +5 -5
  48. sglang/srt/models/llama_classification.py +1 -1
  49. sglang/srt/models/llama_embedding.py +88 -0
  50. sglang/srt/models/llava.py +1 -2
  51. sglang/srt/models/llavavid.py +1 -2
  52. sglang/srt/models/minicpm.py +1 -1
  53. sglang/srt/models/mixtral.py +1 -1
  54. sglang/srt/models/mixtral_quant.py +1 -1
  55. sglang/srt/models/qwen.py +1 -1
  56. sglang/srt/models/qwen2.py +1 -1
  57. sglang/srt/models/qwen2_moe.py +1 -12
  58. sglang/srt/models/stablelm.py +1 -1
  59. sglang/srt/openai_api/adapter.py +189 -39
  60. sglang/srt/openai_api/protocol.py +43 -1
  61. sglang/srt/sampling/penaltylib/__init__.py +13 -0
  62. sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
  63. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
  64. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
  65. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
  66. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
  67. sglang/srt/sampling_params.py +31 -4
  68. sglang/srt/server.py +93 -21
  69. sglang/srt/server_args.py +30 -19
  70. sglang/srt/utils.py +31 -13
  71. sglang/test/run_eval.py +10 -1
  72. sglang/test/runners.py +63 -63
  73. sglang/test/simple_eval_humaneval.py +2 -8
  74. sglang/test/simple_eval_mgsm.py +203 -0
  75. sglang/test/srt/sampling/penaltylib/utils.py +337 -0
  76. sglang/test/test_layernorm.py +60 -0
  77. sglang/test/test_programs.py +4 -2
  78. sglang/test/test_utils.py +21 -3
  79. sglang/utils.py +0 -1
  80. sglang/version.py +1 -1
  81. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/METADATA +50 -31
  82. sglang-0.2.12.dist-info/RECORD +112 -0
  83. sglang/srt/layers/linear.py +0 -884
  84. sglang/srt/layers/quantization/__init__.py +0 -64
  85. sglang/srt/layers/quantization/fp8.py +0 -677
  86. sglang-0.2.10.dist-info/RECORD +0 -100
  87. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/LICENSE +0 -0
  88. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/WHEEL +0 -0
  89. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/top_level.txt +0 -0
sglang/lang/backend/runtime_endpoint.py CHANGED
@@ -1,17 +1,21 @@
 import json
 from typing import List, Optional

-import numpy as np
-
 from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template_by_model_path
+from sglang.lang.choices import (
+    ChoicesDecision,
+    ChoicesSamplingMethod,
+    token_length_normalized,
+)
 from sglang.lang.interpreter import StreamExecutor
 from sglang.lang.ir import SglSamplingParams
 from sglang.utils import http_request


 class RuntimeEndpoint(BaseBackend):
+
     def __init__(
         self,
         base_url: str,
@@ -43,7 +47,7 @@ class RuntimeEndpoint(BaseBackend):
     def flush_cache(self):
         res = http_request(
             self.base_url + "/flush_cache",
-            auth_token=self.auth_token,
+            api_key=self.api_key,
             verify=self.verify,
         )
         self._assert_success(res)
@@ -51,7 +55,7 @@ class RuntimeEndpoint(BaseBackend):
     def get_server_args(self):
         res = http_request(
             self.base_url + "/get_server_args",
-            auth_token=self.auth_token,
+            api_key=self.api_key,
             verify=self.verify,
         )
         self._assert_success(res)
@@ -208,20 +212,14 @@
         s: StreamExecutor,
         choices: List[str],
         temperature: float,
-    ):
+        choices_method: ChoicesSamplingMethod,
+    ) -> ChoicesDecision:
         assert temperature <= 1e-5

         # Cache common prefix
         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
-        self._add_images(s, data)
-        res = http_request(
-            self.base_url + "/generate",
-            json=data,
-            api_key=self.api_key,
-            verify=self.verify,
-        )
-        self._assert_success(res)
-        prompt_len = res.json()["meta_info"]["prompt_tokens"]
+        obj = self._generate_http_request(s, data)
+        prompt_len = obj["meta_info"]["prompt_tokens"]

         # Compute logprob
         data = {
@@ -230,27 +228,35 @@
             "return_logprob": True,
             "logprob_start_len": max(prompt_len - 2, 0),
         }
-        self._add_images(s, data)
-        res = http_request(
-            self.base_url + "/generate",
-            json=data,
-            api_key=self.api_key,
-            verify=self.verify,
-        )
-        self._assert_success(res)
-        obj = res.json()
+        obj = self._generate_http_request(s, data)
+
         normalized_prompt_logprobs = [
             r["meta_info"]["normalized_prompt_logprob"] for r in obj
         ]
-        decision = choices[np.argmax(normalized_prompt_logprobs)]
         input_token_logprobs = [r["meta_info"]["input_token_logprobs"] for r in obj]
         output_token_logprobs = [r["meta_info"]["output_token_logprobs"] for r in obj]

-        return (
-            decision,
-            normalized_prompt_logprobs,
-            input_token_logprobs,
-            output_token_logprobs,
+        # Compute unconditional logprobs if required
+        if choices_method.requires_unconditional_logprobs:
+            input_ids = [[el[1] for el in subl] for subl in input_token_logprobs]
+            data = {
+                "input_ids": input_ids,
+                "sampling_params": {"max_new_tokens": 0},
+                "return_logprob": True,
+            }
+            obj = self._generate_http_request(s, data)
+            unconditional_token_logprobs = [
+                r["meta_info"]["input_token_logprobs"] for r in obj
+            ]
+        else:
+            unconditional_token_logprobs = None
+
+        return choices_method(
+            choices=choices,
+            normalized_prompt_logprobs=normalized_prompt_logprobs,
+            input_token_logprobs=input_token_logprobs,
+            output_token_logprobs=output_token_logprobs,
+            unconditional_token_logprobs=unconditional_token_logprobs,
         )

     def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
@@ -262,6 +268,17 @@
         )
         self._assert_success(res)

+    def _generate_http_request(self, s: StreamExecutor, data):
+        self._add_images(s, data)
+        res = http_request(
+            self.base_url + "/generate",
+            json=data,
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+        return res.json()
+
     def _add_images(self, s: StreamExecutor, data):
         if s.images_:
             assert len(s.images_) == 1, "Only support one image."
sglang/lang/choices.py ADDED
@@ -0,0 +1,164 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+
+
+@dataclass
+class ChoicesDecision:
+    decision: str
+    meta_info: Optional[Dict[str, Any]] = None
+
+
+class ChoicesSamplingMethod(ABC):
+
+    @property
+    def requires_unconditional_logprobs(self) -> bool:
+        return False
+
+    @abstractmethod
+    def __call__(
+        self,
+        *,
+        choices: List[str],
+        normalized_prompt_logprobs: List[float],
+        input_token_logprobs: List[List[Any]],
+        output_token_logprobs: List[List[Any]],
+        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
+    ) -> ChoicesDecision: ...
+
+
+class TokenLengthNormalized(ChoicesSamplingMethod):
+
+    def __call__(
+        self,
+        *,
+        choices: List[str],
+        normalized_prompt_logprobs: List[float],
+        input_token_logprobs: List[List[Any]],
+        output_token_logprobs: List[List[Any]],
+        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
+    ) -> ChoicesDecision:
+        """Select the option with the highest token length normalized prompt logprob."""
+        best_choice = choices[np.argmax(normalized_prompt_logprobs)]
+        meta_info = {
+            "normalized_prompt_logprobs": normalized_prompt_logprobs,
+            "input_token_logprobs": input_token_logprobs,
+            "output_token_logprobs": output_token_logprobs,
+        }
+        return ChoicesDecision(decision=best_choice, meta_info=meta_info)
+
+
+token_length_normalized = TokenLengthNormalized()
+
+
+class GreedyTokenSelection(ChoicesSamplingMethod):
+
+    def __call__(
+        self,
+        *,
+        choices: List[str],
+        normalized_prompt_logprobs: List[float],
+        input_token_logprobs: List[List[Any]],
+        output_token_logprobs: List[List[Any]],
+        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
+    ) -> ChoicesDecision:
+        """Select the option based on greedy logprob selection. For overlapping options
+        where one option is a subset of a longer option, extend the shorter option using
+        its average logprob for comparison against the longer option."""
+
+        num_options = len(choices)
+        max_tokens = max(len(option) for option in input_token_logprobs)
+        logprob_matrix = self._build_logprob_matrix(
+            input_token_logprobs, max_tokens, num_options
+        )
+        remaining = self._greedy_selection(logprob_matrix, num_options, max_tokens)
+
+        best_choice = choices[remaining[0]]
+        meta_info = {
+            "normalized_prompt_logprobs": normalized_prompt_logprobs,
+            "input_token_logprobs": input_token_logprobs,
+            "output_token_logprobs": output_token_logprobs,
+            "greedy_logprob_matrix": logprob_matrix.tolist(),
+        }
+        return ChoicesDecision(decision=best_choice, meta_info=meta_info)
+
+    def _build_logprob_matrix(self, input_token_logprobs, max_tokens, num_options):
+        logprob_matrix = np.zeros((num_options, max_tokens))
+        for i, option in enumerate(input_token_logprobs):
+            actual_logprobs = [token[0] for token in option]
+            avg_logprob = np.mean(actual_logprobs)
+            logprob_matrix[i, : len(option)] = actual_logprobs
+            if len(option) < max_tokens:
+                logprob_matrix[i, len(option) :] = avg_logprob
+        return logprob_matrix
+
+    def _greedy_selection(self, logprob_matrix, num_options, max_tokens):
+        remaining = np.arange(num_options)
+        for j in range(max_tokens):
+            max_logprob = np.max(logprob_matrix[remaining, j])
+            remaining = remaining[logprob_matrix[remaining, j] == max_logprob]
+            if len(remaining) == 1:
+                break
+        return remaining
+
+
+greedy_token_selection = GreedyTokenSelection()
+
+
+class UnconditionalLikelihoodNormalized(ChoicesSamplingMethod):
+
+    @property
+    def requires_unconditional_logprobs(self) -> bool:
+        return True
+
+    def __call__(
+        self,
+        *,
+        choices: List[str],
+        normalized_prompt_logprobs: List[float],
+        input_token_logprobs: List[List[Any]],
+        output_token_logprobs: List[List[Any]],
+        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
+    ) -> ChoicesDecision:
+        """Select the option with the highest average token logprob once normalized by
+        the unconditional token logprobs.
+
+        The first unconditional token logprob is assumed to be None. If so, it is
+        replaced with 0 for the purposes of normalization."""
+
+        if unconditional_token_logprobs is None:
+            raise ValueError(
+                "Unconditional token logprobs are required for this method."
+            )
+
+        normalized_unconditional_prompt_logprobs = self._normalize_logprobs(
+            input_token_logprobs, unconditional_token_logprobs
+        )
+
+        best_choice = choices[np.argmax(normalized_unconditional_prompt_logprobs)]
+        meta_info = {
+            "normalized_prompt_logprobs": normalized_prompt_logprobs,
+            "input_token_logprobs": input_token_logprobs,
+            "output_token_logprobs": output_token_logprobs,
+            "unconditional_token_logprobs": unconditional_token_logprobs,
+            "normalized_unconditional_prompt_logprobs": normalized_unconditional_prompt_logprobs,
+        }
+        return ChoicesDecision(decision=best_choice, meta_info=meta_info)
+
+    def _normalize_logprobs(self, input_token_logprobs, unconditional_token_logprobs):
+        normalized_unconditional_prompt_logprobs = []
+        for inputs, unconditionals in zip(
+            input_token_logprobs, unconditional_token_logprobs
+        ):
+            inputs_logprobs = np.array([token[0] for token in inputs])
+            unconditionals_logprobs = np.array([token[0] for token in unconditionals])
+            unconditionals_logprobs[0] = unconditionals_logprobs[0] or 0
+            normalized_unconditional_prompt_logprobs.append(
+                float(np.mean(inputs_logprobs - unconditionals_logprobs))
+            )
+        return normalized_unconditional_prompt_logprobs
+
+
+unconditional_likelihood_normalized = UnconditionalLikelihoodNormalized()
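The three strategies exported here (token_length_normalized, greedy_token_selection, unconditional_likelihood_normalized) are chosen per select call. A minimal usage sketch, assuming sgl.select forwards a new choices_method keyword (the api.py change listed above, +10 -2, is consistent with this) and that a local sglang server is running:

    import sglang as sgl
    from sglang.lang.choices import greedy_token_selection

    sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

    @sgl.function
    def triage(s, question):
        s += question
        # Compare options token-by-token instead of by length-normalized prompt logprob.
        s += sgl.select("verdict", ["safe", "unsafe"], choices_method=greedy_token_selection)

    state = triage.run(question="Is this request safe to execute? ")
    print(state["verdict"])
    print(state.get_meta_info("verdict")["greedy_logprob_matrix"])  # surfaced via ChoicesDecision.meta_info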
sglang/lang/compiler.py CHANGED
@@ -125,7 +125,7 @@ class CompiledFunction:
     def run(
         self,
         *,
-        max_new_tokens: int = 16,
+        max_new_tokens: int = 128,
         stop: Union[str, List[str]] = (),
         temperature: float = 1.0,
         top_p: float = 1.0,
@@ -155,7 +155,7 @@
         self,
         batch_kwargs,
         *,
-        max_new_tokens: int = 16,
+        max_new_tokens: int = 128,
         stop: Union[str, List[str]] = (),
         temperature: float = 1.0,
         top_p: float = 1.0,
sglang/lang/interpreter.py CHANGED
@@ -538,24 +538,17 @@ class StreamExecutor:
             self.stream_var_event[name].set()

     def _execute_select(self, expr: SglSelect):
-        (
-            decision,
-            normalized_prompt_logprobs,
-            input_token_logprobs,
-            output_token_logprobs,
-        ) = self.backend.select(self, expr.choices, expr.temperature)
+        choices_decision = self.backend.select(
+            self, expr.choices, expr.temperature, expr.choices_method
+        )
         if expr.name is not None:
             name = expr.name
-            self.variables[name] = decision
-            self.meta_info[name] = {
-                "normalized_prompt_logprobs": normalized_prompt_logprobs,
-                "input_token_logprobs": input_token_logprobs,
-                "output_token_logprobs": output_token_logprobs,
-            }
+            self.variables[name] = choices_decision.decision
+            self.meta_info[name] = choices_decision.meta_info
             self.variable_event[name].set()
             if self.stream_var_event:
                 self.stream_var_event[name].set()
-        self.text_ += decision
+        self.text_ += choices_decision.decision

     def _execute_variable(self, expr: SglVariable):
         src_executor = expr.source_stream_executor
sglang/lang/ir.py CHANGED
@@ -6,6 +6,7 @@ import warnings
 from typing import List, Optional, Union

 from sglang.global_config import global_config
+from sglang.lang.choices import ChoicesSamplingMethod

 REGEX_INT = r"[-+]?[0-9]+"
 REGEX_FLOAT = r"[-+]?[0-9]*\.?[0-9]+"
@@ -15,7 +16,7 @@ REGEX_STRING = r"\"[\w\d\s]*\""  # bugs with regex r"\".*\"" in interegular pkg

 @dataclasses.dataclass
 class SglSamplingParams:
-    max_new_tokens: int = 16
+    max_new_tokens: int = 128
     stop: Union[str, List[str]] = ()
     temperature: float = 1.0
     top_p: float = 1.0
@@ -139,7 +140,7 @@ class SglFunction:
     def run(
         self,
         *args,
-        max_new_tokens: int = 16,
+        max_new_tokens: int = 128,
         stop: Union[str, List[str]] = (),
         temperature: float = 1.0,
         top_p: float = 1.0,
@@ -178,7 +179,7 @@
         self,
         batch_kwargs,
         *,
-        max_new_tokens: int = 16,
+        max_new_tokens: int = 128,
         stop: Union[str, List[str]] = (),
         temperature: float = 1.0,
         top_p: float = 1.0,
@@ -461,14 +462,22 @@ class SglRoleEnd(SglExpr):


 class SglSelect(SglExpr):
-    def __init__(self, name: str, choices: List[str], temperature: float):
+
+    def __init__(
+        self,
+        name: str,
+        choices: List[str],
+        temperature: float,
+        choices_method: ChoicesSamplingMethod,
+    ):
         super().__init__()
         self.name = name
         self.choices = choices
         self.temperature = temperature
+        self.choices_method = choices_method

     def __repr__(self):
-        return f"Select({self.name}, choices={self.choices})"
+        return f"Select({self.name}, choices={self.choices}, choices_method={self.choices_method})"


 class SglFork(SglExpr):
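Note that ir.py and compiler.py both raise the default max_new_tokens from 16 to 128. Code that depended on the old short default should pin it explicitly; a one-line sketch (the function name is illustrative):

    state = my_func.run(question="...", max_new_tokens=16)  # restore the old default explicitly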
sglang/srt/constrained/base_tool_cache.py CHANGED
@@ -54,7 +54,7 @@ class BaseToolCache:
         return val

     def init_value(self, key):
-        raise NotImplementedError
+        raise NotImplementedError()

     def get_cache_hit_rate(self):
         if self.metrics["total"] == 0:
sglang/srt/constrained/fsm_cache.py CHANGED
@@ -20,10 +20,20 @@ from sglang.srt.constrained.base_tool_cache import BaseToolCache


 class FSMCache(BaseToolCache):
-    def __init__(self, tokenizer_path, tokenizer_args_dict, enable=True):
+    def __init__(
+        self,
+        tokenizer_path,
+        tokenizer_args_dict,
+        enable=True,
+        skip_tokenizer_init=False,
+    ):
         super().__init__(enable=enable)

-        if tokenizer_path.endswith(".json") or tokenizer_path.endswith(".model"):
+        if (
+            skip_tokenizer_init
+            or tokenizer_path.endswith(".json")
+            or tokenizer_path.endswith(".model")
+        ):
             # Do not support TiktokenTokenizer or SentencePieceTokenizer
             return

sglang/srt/layers/activation.py ADDED
@@ -0,0 +1,33 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Fused operators for activation layers."""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from flashinfer.activation import silu_and_mul
+from vllm.model_executor.custom_op import CustomOp
+
+
+class SiluAndMul(CustomOp):
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        return F.silu(x[..., :d]) * x[..., d:]
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        silu_and_mul(x, out)
+        return out
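For reference, forward_native spells out the fused kernel's math: split the last dimension in half, apply SiLU to the first half, and use it to gate the second half. A CPU-only check of that identity, independent of flashinfer:

    import torch
    import torch.nn.functional as F

    x = torch.randn(2, 8)                  # last dim is 2 * d
    d = x.shape[-1] // 2
    out = F.silu(x[..., :d]) * x[..., d:]  # what SiluAndMul.forward_native computes
    assert out.shape == (2, 4)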
sglang/srt/layers/{token_attention.py → decode_attention.py} RENAMED
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+"""
+Memory-efficient attention for decoding.
+"""
+
 # Adapted from
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_softmax_and_reducev.py
@@ -194,7 +198,7 @@ def _fwd_kernel_stage2(
     tl.store(out_ptrs, acc)


-def _token_att_m_fwd(
+def _decode_att_m_fwd(
     q,
     k_buffer,
     att_out,
@@ -254,7 +258,7 @@ def _token_att_m_fwd(
     )


-def _token_softmax_reducev_fwd(
+def _decode_softmax_reducev_fwd(
     logics,
     v_buffer,
     o,
@@ -292,7 +296,7 @@ def _token_softmax_reducev_fwd(
     )


-def token_attention_fwd(
+def decode_attention_fwd(
     q,
     k_buffer,
     v_buffer,
@@ -312,7 +316,7 @@ def token_attention_fwd(
         (q.shape[-2], total_num_tokens), dtype=REDUCE_TORCH_TYPE, device="cuda"
     )

-    _token_att_m_fwd(
+    _decode_att_m_fwd(
         q,
         k_buffer,
         att_m,
@@ -324,7 +328,7 @@
         sm_scale,
         logit_cap,
     )
-    _token_softmax_reducev_fwd(
+    _decode_softmax_reducev_fwd(
         att_m,
         v_buffer,
         o,
sglang/srt/layers/extend_attention.py CHANGED
@@ -13,11 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+"""
+Memory-efficient attention for prefill.
+It supports page size = 1 and prefill with KV cache (i.e. extend).
+"""
+
 import torch
 import triton
 import triton.language as tl

-from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
+from sglang.srt.layers.prefill_attention import context_attention_fwd

 CUDA_CAPABILITY = torch.cuda.get_device_capability()

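For downstream imports, the two attention-layer renames in this release are path changes; the function parameter lists visible in the hunks above are otherwise untouched. A migration sketch:

    # 0.2.10
    # from sglang.srt.layers.token_attention import token_attention_fwd
    # from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd

    # 0.2.12
    from sglang.srt.layers.decode_attention import decode_attention_fwd
    from sglang.srt.layers.prefill_attention import context_attention_fwd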
sglang/srt/layers/layernorm.py ADDED
@@ -0,0 +1,65 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Fused operators for normalization layers."""
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from flashinfer.norm import fused_add_rmsnorm, rmsnorm
+from vllm.model_executor.custom_op import CustomOp
+
+
+class RMSNorm(CustomOp):
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+
+        if residual is not None:
+            fused_add_rmsnorm(x, residual, self.weight.data, self.variance_epsilon)
+            return x, residual
+        out = rmsnorm(x, self.weight.data, self.variance_epsilon)
+        return out
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        orig_dtype = x.dtype
+        x = x.to(torch.float32)
+        if residual is not None:
+            x = x + residual.to(torch.float32)
+            residual = x.to(orig_dtype)
+
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = x.to(orig_dtype) * self.weight
+        if residual is None:
+            return x
+        else:
+            return x, residual
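forward_native doubles as a readable spec for the flashinfer kernels used in forward_cuda: compute y = x * rsqrt(mean(x^2) + eps) in fp32, cast back, then scale by the learned weight. A standalone sketch of that math:

    import torch

    x = torch.randn(2, 4)
    weight = torch.ones(4)   # RMSNorm initializes its weight to ones
    eps = 1e-6

    variance = x.pow(2).mean(dim=-1, keepdim=True)
    y = x * torch.rsqrt(variance + eps) * weight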
sglang/srt/layers/logits_processor.py CHANGED
@@ -25,7 +25,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )

-from sglang.srt.model_executor.model_runner import ForwardMode, InputMetadata
+from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata


 @dataclasses.dataclass
@@ -208,6 +208,11 @@ class LogitsProcessor(nn.Module):
             all_logits = tensor_model_parallel_all_gather(all_logits)
         all_logits = all_logits[:, : self.config.vocab_size].float()

+        if hasattr(self.config, "final_logit_softcapping"):
+            all_logits /= self.config.final_logit_softcapping
+            all_logits = torch.tanh(all_logits)
+            all_logits *= self.config.final_logit_softcapping
+
         all_logprobs = all_logits
         del all_logits, hidden_states
         all_logprobs[:] = torch.nn.functional.log_softmax(all_logprobs, dim=-1)
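The new block applies final-logit softcapping, logits ← cap · tanh(logits / cap), which bounds logits to (−cap, cap); Gemma 2 ships such a cap in its config as final_logit_softcapping (30.0 for the final logits). A standalone sketch of the transform:

    import torch

    cap = 30.0
    logits = torch.tensor([5.0, 60.0, -120.0])
    capped = cap * torch.tanh(logits / cap)  # ≈ [4.95, 28.92, -29.98]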
sglang/srt/layers/pooler.py ADDED
@@ -0,0 +1,50 @@
+# adapted from
+# https://github.com/vllm-project/vllm/blob/82a1b1a82b1fbb454c82a9ef95730b929c9b270c/vllm/model_executor/layers/pooler.py
+
+from dataclasses import dataclass
+from enum import IntEnum
+
+import torch
+import torch.nn as nn
+
+from sglang.srt.model_executor.model_runner import InputMetadata
+
+
+class PoolingType(IntEnum):
+    LAST = 0
+
+
+@dataclass
+class EmbeddingPoolerOutput:
+    embeddings: torch.Tensor
+
+
+class Pooler(nn.Module):
+    """A layer that pools specific information from hidden states.
+    This layer does the following:
+    1. Extracts specific tokens or aggregates data based on pooling method.
+    2. Normalizes output if specified.
+    3. Returns structured results as `PoolerOutput`.
+    Attributes:
+        pooling_type: The type of pooling to use (LAST, AVERAGE, MAX).
+        normalize: Whether to normalize the pooled data.
+    """
+
+    def __init__(self, pooling_type: PoolingType, normalize: bool):
+        super().__init__()
+        self.pooling_type = pooling_type
+        self.normalize = normalize
+
+    def forward(
+        self, hidden_states: torch.Tensor, input_metadata: InputMetadata
+    ) -> EmbeddingPoolerOutput:
+        if self.pooling_type == PoolingType.LAST:
+            last_token_indices = torch.cumsum(input_metadata.extend_seq_lens, dim=0) - 1
+            pooled_data = hidden_states[last_token_indices]
+        else:
+            raise ValueError(f"Invalid pooling type: {self.pooling_type}")
+
+        if self.normalize:
+            pooled_data = nn.functional.normalize(pooled_data, p=2, dim=1)
+
+        return EmbeddingPoolerOutput(embeddings=pooled_data)
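LAST pooling selects one hidden state per packed sequence: the cumulative sum of per-sequence lengths, minus one, indexes each sequence's final token. A pure-torch sketch with illustrative shapes:

    import torch

    extend_seq_lens = torch.tensor([3, 2])      # two sequences packed into one batch
    hidden_states = torch.randn(5, 8)           # 3 + 2 tokens, hidden size 8
    last_token_indices = torch.cumsum(extend_seq_lens, dim=0) - 1  # tensor([2, 4])
    pooled = hidden_states[last_token_indices]  # (2, 8): one vector per sequence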