sglang 0.2.11__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sglang/bench_latency.py +6 -4
  2. sglang/bench_serving.py +46 -22
  3. sglang/lang/compiler.py +2 -2
  4. sglang/lang/ir.py +3 -3
  5. sglang/srt/constrained/base_tool_cache.py +1 -1
  6. sglang/srt/constrained/fsm_cache.py +12 -2
  7. sglang/srt/layers/activation.py +33 -0
  8. sglang/srt/layers/{token_attention.py → decode_attention.py} +9 -5
  9. sglang/srt/layers/extend_attention.py +6 -1
  10. sglang/srt/layers/layernorm.py +65 -0
  11. sglang/srt/layers/logits_processor.py +5 -0
  12. sglang/srt/layers/pooler.py +50 -0
  13. sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} +5 -0
  14. sglang/srt/layers/radix_attention.py +2 -2
  15. sglang/srt/managers/detokenizer_manager.py +31 -9
  16. sglang/srt/managers/io_struct.py +63 -0
  17. sglang/srt/managers/policy_scheduler.py +173 -25
  18. sglang/srt/managers/schedule_batch.py +110 -87
  19. sglang/srt/managers/tokenizer_manager.py +193 -111
  20. sglang/srt/managers/tp_worker.py +289 -352
  21. sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py} +9 -4
  22. sglang/srt/mem_cache/chunk_cache.py +43 -20
  23. sglang/srt/mem_cache/memory_pool.py +2 -2
  24. sglang/srt/mem_cache/radix_cache.py +74 -40
  25. sglang/srt/model_executor/cuda_graph_runner.py +24 -9
  26. sglang/srt/model_executor/forward_batch_info.py +168 -105
  27. sglang/srt/model_executor/model_runner.py +24 -37
  28. sglang/srt/models/gemma2.py +0 -1
  29. sglang/srt/models/internlm2.py +2 -7
  30. sglang/srt/models/llama2.py +4 -4
  31. sglang/srt/models/llama_embedding.py +88 -0
  32. sglang/srt/models/qwen2_moe.py +0 -11
  33. sglang/srt/openai_api/adapter.py +155 -27
  34. sglang/srt/openai_api/protocol.py +37 -1
  35. sglang/srt/sampling/penaltylib/__init__.py +13 -0
  36. sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
  37. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
  38. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
  39. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
  40. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
  41. sglang/srt/sampling_params.py +31 -4
  42. sglang/srt/server.py +69 -15
  43. sglang/srt/server_args.py +26 -19
  44. sglang/srt/utils.py +31 -13
  45. sglang/test/run_eval.py +10 -1
  46. sglang/test/runners.py +63 -63
  47. sglang/test/simple_eval_humaneval.py +2 -8
  48. sglang/test/simple_eval_mgsm.py +203 -0
  49. sglang/test/srt/sampling/penaltylib/utils.py +337 -0
  50. sglang/test/test_layernorm.py +60 -0
  51. sglang/test/test_programs.py +4 -2
  52. sglang/test/test_utils.py +20 -2
  53. sglang/utils.py +0 -1
  54. sglang/version.py +1 -1
  55. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/METADATA +23 -14
  56. sglang-0.2.12.dist-info/RECORD +112 -0
  57. sglang/srt/layers/linear.py +0 -884
  58. sglang/srt/layers/quantization/__init__.py +0 -64
  59. sglang/srt/layers/quantization/fp8.py +0 -677
  60. sglang-0.2.11.dist-info/RECORD +0 -102
  61. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/LICENSE +0 -0
  62. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/WHEEL +0 -0
  63. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/top_level.txt +0 -0
sglang/bench_latency.py CHANGED
@@ -152,7 +152,7 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
  req = Req(rid=i, origin_input_text=prompts[i], origin_input_ids=tmp_input_ids)
  req.prefix_indices = []
  req.sampling_params = sampling_params
- req.input_ids = req.origin_input_ids
+ req.fill_ids = req.origin_input_ids
  reqs.append(req)

  return input_ids, reqs
@@ -163,7 +163,7 @@ def prepare_extend_inputs_for_correctness_test(
  ):
  for i in range(len(reqs)):
  req = reqs[i]
- req.input_ids += input_ids[i][bench_args.cut_len :]
+ req.fill_ids += input_ids[i][bench_args.cut_len :]
  req.prefix_indices = model_runner.req_to_token_pool.req_to_token[
  i, : bench_args.cut_len
  ]
@@ -182,7 +182,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
  req = Req(rid=i, origin_input_text="", origin_input_ids=list(input_ids[i]))
  req.prefix_indices = []
  req.sampling_params = sampling_params
- req.input_ids = req.origin_input_ids
+ req.fill_ids = req.origin_input_ids
  reqs.append(req)

  return reqs
@@ -238,7 +238,7 @@ def correctness_test(

  # Decode
  output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
- for _ in range(bench_args.output_len):
+ for _ in range(bench_args.output_len[0]):
  next_token_ids, _ = decode(next_token_ids, batch, model_runner)
  for i in range(len(reqs)):
  output_ids[i].append(next_token_ids[i])
@@ -332,6 +332,7 @@ def latency_test(
  )

  # Warm up
+ rank_print("Warmup ...")
  latency_test_run_once(
  bench_args.run_name,
  model_runner,
@@ -341,6 +342,7 @@ def latency_test(
  bench_args.input_len[0],
  4, # shorter decoding to speed up the warmup
  )
+ rank_print("Benchmark ...")

  # Run the sweep
  result_list = []
sglang/bench_serving.py CHANGED
@@ -24,7 +24,7 @@ import warnings
  from argparse import ArgumentParser
  from dataclasses import dataclass, field
  from datetime import datetime
- from typing import AsyncGenerator, List, Optional, Tuple, Union
+ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union

  import aiohttp
  import numpy as np
@@ -39,6 +39,8 @@ from transformers import (

  AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)

+ global args
+

  @dataclass
  class RequestFuncInput:
@@ -47,6 +49,7 @@ class RequestFuncInput:
  prompt_len: int
  output_len: int
  model: str
+ extra_request_body: Dict[str, Any]


  @dataclass
@@ -84,6 +87,7 @@ async def async_request_trt_llm(
  "stream": True,
  "min_length": request_func_input.output_len,
  "end_id": 1048576,
+ **request_func_input.extra_request_body,
  }
  if args.disable_ignore_eos:
  del payload["min_length"]
@@ -154,6 +158,7 @@ async def async_request_openai_completions(
  "max_tokens": request_func_input.output_len,
  "stream": not args.disable_stream,
  "ignore_eos": not args.disable_ignore_eos,
+ **request_func_input.extra_request_body,
  }
  headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}

@@ -192,7 +197,8 @@ async def async_request_openai_completions(
  output.ttft = ttft

  # Decoding phase
- output.itl.append(timestamp - most_recent_timestamp)
+ else:
+ output.itl.append(timestamp - most_recent_timestamp)

  most_recent_timestamp = timestamp
  generated_text += data["choices"][0]["text"]
@@ -542,6 +548,7 @@ async def benchmark(
  request_rate: float,
  disable_tqdm: bool,
  enable_multi: bool,
+ extra_request_body: Dict[str, Any],
  ):
  if backend in ASYNC_REQUEST_FUNCS:
  request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -556,6 +563,7 @@ async def benchmark(
  api_url=api_url,
  prompt_len=test_prompt_len,
  output_len=test_output_len,
+ extra_request_body=extra_request_body,
  )
  test_output = await request_func(request_func_input=test_input)
  if not test_output.success:
@@ -578,6 +586,7 @@ async def benchmark(
  api_url=api_url,
  prompt_len=prompt_len,
  output_len=output_len,
+ extra_request_body=extra_request_body,
  )
  tasks.append(
  asyncio.create_task(
@@ -660,19 +669,20 @@ async def benchmark(
  "backend": args.backend,
  "dataset_name": args.dataset_name,
  "request_rate": request_rate,
- "total_input": metrics.total_input,
- "total_output": metrics.total_output,
- "total_output_retokenized": metrics.total_output_retokenized,
- "mean_e2e_latency": metrics.mean_e2e_latency_ms,
- "median_e2e_latency": metrics.median_e2e_latency_ms,
- "median_ttft": metrics.median_ttft_ms,
- "median_itl": metrics.median_itl_ms,
- "output_token_throughput": metrics.output_throughput,
+ "total_input_tokens": metrics.total_input,
+ "total_output_tokens": metrics.total_output,
+ "total_output_tokens_retokenized": metrics.total_output_retokenized,
+ "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+ "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
+ "median_ttft_ms": metrics.median_ttft_ms,
+ "median_itl_ms": metrics.median_itl_ms,
+ "output_throughput": metrics.output_throughput,
  "sharegpt_output_len": args.sharegpt_output_len,
  "random_input_len": args.random_input_len,
  "random_output_len": args.random_output_len,
  "random_range_ratio": args.random_range_ratio,
- "benchmark_duration": benchmark_duration,
+ "duration": benchmark_duration,
+ "completed": metrics.completed,
  }
  else:
  print(f"Error running benchmark for request rate: {request_rate}")
@@ -742,10 +752,18 @@ def check_chat_template(model_path):
  return False


- def fire(args: argparse.Namespace):
+ def run_benchmark(args_: argparse.Namespace):
+ global args
+ args = args_
+
+ set_ulimit()
  random.seed(args.seed)
  np.random.seed(args.seed)

+ extra_request_body = {}
+ if args.extra_request_body:
+ extra_request_body = json.loads(args.extra_request_body)
+
  if args.port is None:
  args.port = {
  "sglang": 30000,
@@ -838,10 +856,11 @@ def fire(args: argparse.Namespace):
  request_rate=rate,
  disable_tqdm=args.disable_tqdm,
  enable_multi=args.multi,
+ extra_request_body=extra_request_body,
  )
  )
  else:
- asyncio.run(
+ return asyncio.run(
  benchmark(
  backend=backend,
  api_url=api_url,
@@ -851,6 +870,7 @@ def fire(args: argparse.Namespace):
  request_rate=args.request_rate,
  disable_tqdm=args.disable_tqdm,
  enable_multi=args.multi,
+ extra_request_body=extra_request_body,
  )
  )

@@ -949,11 +969,6 @@ if __name__ == "__main__":
  "Otherwise, we use Poisson process to synthesize the request arrival times. Default is 128.0.",
  )
  parser.add_argument("--seed", type=int, default=0, help="Default is 0.")
- parser.add_argument(
- "--disable-tqdm",
- action="store_true",
- help="Specify to disable tqdm progress bar.",
- )
  parser.add_argument(
  "--multi",
  action="store_true",
@@ -966,6 +981,11 @@ if __name__ == "__main__":
  help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
  )
  parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+ parser.add_argument(
+ "--disable-tqdm",
+ action="store_true",
+ help="Specify to disable tqdm progress bar.",
+ )
  parser.add_argument(
  "--disable-stream",
  action="store_true",
@@ -976,8 +996,12 @@ if __name__ == "__main__":
  action="store_true",
  help="Disable ignoring EOS.",
  )
-
- set_ulimit()
-
+ parser.add_argument(
+ "--extra-request-body",
+ metavar='{"key1": "value1", "key2": "value2"}',
+ type=str,
+ help="Append given JSON object to the request payload. You can use this to specify "
+ "additional generate params like sampling params.",
+ )
  args = parser.parse_args()
- fire(args)
+ run_benchmark(args)
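Two things change here for callers: the entry point is renamed from fire to run_benchmark and now returns the result of asyncio.run, so the benchmark can be driven programmatically, and the new --extra-request-body flag merges a user-supplied JSON object into every request payload. A usage sketch (the sampling key shown is illustrative; whatever you pass must be accepted by the target backend's API):

    python3 -m sglang.bench_serving --backend sglang \
        --extra-request-body '{"temperature": 0.0}'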
sglang/lang/compiler.py CHANGED
@@ -125,7 +125,7 @@ class CompiledFunction:
  def run(
  self,
  *,
- max_new_tokens: int = 16,
+ max_new_tokens: int = 128,
  stop: Union[str, List[str]] = (),
  temperature: float = 1.0,
  top_p: float = 1.0,
@@ -155,7 +155,7 @@ class CompiledFunction:
  self,
  batch_kwargs,
  *,
- max_new_tokens: int = 16,
+ max_new_tokens: int = 128,
  stop: Union[str, List[str]] = (),
  temperature: float = 1.0,
  top_p: float = 1.0,
sglang/lang/ir.py CHANGED
@@ -16,7 +16,7 @@ REGEX_STRING = r"\"[\w\d\s]*\"" # bugs with regex r"\".*\"" in interegular pkg

  @dataclasses.dataclass
  class SglSamplingParams:
- max_new_tokens: int = 16
+ max_new_tokens: int = 128
  stop: Union[str, List[str]] = ()
  temperature: float = 1.0
  top_p: float = 1.0
@@ -140,7 +140,7 @@ class SglFunction:
  def run(
  self,
  *args,
- max_new_tokens: int = 16,
+ max_new_tokens: int = 128,
  stop: Union[str, List[str]] = (),
  temperature: float = 1.0,
  top_p: float = 1.0,
@@ -179,7 +179,7 @@ class SglFunction:
  self,
  batch_kwargs,
  *,
- max_new_tokens: int = 16,
+ max_new_tokens: int = 128,
  stop: Union[str, List[str]] = (),
  temperature: float = 1.0,
  top_p: float = 1.0,
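The default max_new_tokens moves from 16 to 128 across SglSamplingParams, SglFunction.run, and the compiled variants, so frontend programs that silently relied on the old short budget will now generate longer outputs. A minimal sketch of pinning the limit explicitly with the standard sglang frontend API (the prompt is made up):

    import sglang as sgl

    @sgl.function
    def answer(s, question):
        s += "Q: " + question + "\n"
        # pin the old 16-token budget instead of relying on the new default
        s += "A:" + sgl.gen("ans", max_new_tokens=16)

Per-call overrides also work, since run still accepts the parameter: answer.run(question="...", max_new_tokens=16).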
sglang/srt/constrained/base_tool_cache.py CHANGED
@@ -54,7 +54,7 @@ class BaseToolCache:
  return val

  def init_value(self, key):
- raise NotImplementedError
+ raise NotImplementedError()

  def get_cache_hit_rate(self):
  if self.metrics["total"] == 0:
sglang/srt/constrained/fsm_cache.py CHANGED
@@ -20,10 +20,20 @@ from sglang.srt.constrained.base_tool_cache import BaseToolCache


  class FSMCache(BaseToolCache):
- def __init__(self, tokenizer_path, tokenizer_args_dict, enable=True):
+ def __init__(
+ self,
+ tokenizer_path,
+ tokenizer_args_dict,
+ enable=True,
+ skip_tokenizer_init=False,
+ ):
  super().__init__(enable=enable)

- if tokenizer_path.endswith(".json") or tokenizer_path.endswith(".model"):
+ if (
+ skip_tokenizer_init
+ or tokenizer_path.endswith(".json")
+ or tokenizer_path.endswith(".model")
+ ):
  # Do not support TiktokenTokenizer or SentencePieceTokenizer
  return
sglang/srt/layers/activation.py ADDED
@@ -0,0 +1,33 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ """Fused operators for activation layers."""
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from flashinfer.activation import silu_and_mul
+ from vllm.model_executor.custom_op import CustomOp
+
+
+ class SiluAndMul(CustomOp):
+     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+         d = x.shape[-1] // 2
+         return F.silu(x[..., :d]) * x[..., d:]
+
+     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+         d = x.shape[-1] // 2
+         output_shape = x.shape[:-1] + (d,)
+         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+         silu_and_mul(x, out)
+         return out
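SiluAndMul is the gated-MLP activation: the input concatenates the gate and up projections along the last dimension, and the output is silu(gate) * up at half the width. A quick shape-and-semantics check of the native path (pure PyTorch, no flashinfer required; the sizes are illustrative):

    import torch
    import torch.nn.functional as F

    x = torch.randn(4, 2 * 11008)          # [tokens, 2 * intermediate_size]
    d = x.shape[-1] // 2
    ref = F.silu(x[..., :d]) * x[..., d:]  # what forward_native computes
    assert ref.shape == (4, 11008)

forward_cuda produces the same result via flashinfer's fused silu_and_mul kernel, skipping the intermediate silu tensor.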
sglang/srt/layers/{token_attention.py → decode_attention.py} RENAMED
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """

+ """
+ Memory-efficient attention for decoding.
+ """
+
  # Adapted from
  # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py
  # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_softmax_and_reducev.py
@@ -194,7 +198,7 @@ def _fwd_kernel_stage2(
  tl.store(out_ptrs, acc)


- def _token_att_m_fwd(
+ def _decode_att_m_fwd(
  q,
  k_buffer,
  att_out,
@@ -254,7 +258,7 @@ def _token_att_m_fwd(
  )


- def _token_softmax_reducev_fwd(
+ def _decode_softmax_reducev_fwd(
  logics,
  v_buffer,
  o,
@@ -292,7 +296,7 @@ def _token_softmax_reducev_fwd(
  )


- def token_attention_fwd(
+ def decode_attention_fwd(
  q,
  k_buffer,
  v_buffer,
@@ -312,7 +316,7 @@ def token_attention_fwd(
  (q.shape[-2], total_num_tokens), dtype=REDUCE_TORCH_TYPE, device="cuda"
  )

- _token_att_m_fwd(
+ _decode_att_m_fwd(
  q,
  k_buffer,
  att_m,
@@ -324,7 +328,7 @@ def token_attention_fwd(
  sm_scale,
  logit_cap,
  )
- _token_softmax_reducev_fwd(
+ _decode_softmax_reducev_fwd(
  att_m,
  v_buffer,
  o,
sglang/srt/layers/extend_attention.py CHANGED
@@ -13,11 +13,16 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """

+ """
+ Memory-efficient attention for prefill.
+ It supports page size = 1 and prefill with KV cache (i.e. extend).
+ """
+
  import torch
  import triton
  import triton.language as tl

- from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
+ from sglang.srt.layers.prefill_attention import context_attention_fwd

  CUDA_CAPABILITY = torch.cuda.get_device_capability()
sglang/srt/layers/layernorm.py ADDED
@@ -0,0 +1,65 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ """Fused operators for normalization layers."""
+
+ from typing import Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ from flashinfer.norm import fused_add_rmsnorm, rmsnorm
+ from vllm.model_executor.custom_op import CustomOp
+
+
+ class RMSNorm(CustomOp):
+     def __init__(
+         self,
+         hidden_size: int,
+         eps: float = 1e-6,
+     ) -> None:
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(hidden_size))
+         self.variance_epsilon = eps
+
+     def forward_cuda(
+         self,
+         x: torch.Tensor,
+         residual: Optional[torch.Tensor] = None,
+     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+
+         if residual is not None:
+             fused_add_rmsnorm(x, residual, self.weight.data, self.variance_epsilon)
+             return x, residual
+         out = rmsnorm(x, self.weight.data, self.variance_epsilon)
+         return out
+
+     def forward_native(
+         self,
+         x: torch.Tensor,
+         residual: Optional[torch.Tensor] = None,
+     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+         orig_dtype = x.dtype
+         x = x.to(torch.float32)
+         if residual is not None:
+             x = x + residual.to(torch.float32)
+             residual = x.to(orig_dtype)
+
+         variance = x.pow(2).mean(dim=-1, keepdim=True)
+         x = x * torch.rsqrt(variance + self.variance_epsilon)
+         x = x.to(orig_dtype) * self.weight
+         if residual is None:
+             return x
+         else:
+             return x, residual
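One behavioral detail worth noting: on the CUDA path, flashinfer's fused_add_rmsnorm mutates x and residual in place and returns them, while forward_native allocates fresh tensors, so callers should not assume the inputs survive the fused call unchanged. A minimal out-of-place restatement of the math (no residual, pure PyTorch):

    import torch

    def rmsnorm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
        # mirrors RMSNorm.forward_native without a residual: scale by the
        # reciprocal root-mean-square over the last dimension, then by weight
        variance = x.float().pow(2).mean(dim=-1, keepdim=True)
        return (x.float() * torch.rsqrt(variance + eps)).to(x.dtype) * weight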
sglang/srt/layers/logits_processor.py CHANGED
@@ -208,6 +208,11 @@ class LogitsProcessor(nn.Module):
  all_logits = tensor_model_parallel_all_gather(all_logits)
  all_logits = all_logits[:, : self.config.vocab_size].float()

+ if hasattr(self.config, "final_logit_softcapping"):
+ all_logits /= self.config.final_logit_softcapping
+ all_logits = torch.tanh(all_logits)
+ all_logits *= self.config.final_logit_softcapping
+
  all_logprobs = all_logits
  del all_logits, hidden_states
  all_logprobs[:] = torch.nn.functional.log_softmax(all_logprobs, dim=-1)
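The new branch applies final-logit softcapping as used by Gemma 2: each logit is squashed into (-cap, cap) via cap * tanh(logit / cap), which is smooth and approximately the identity near zero. A standalone restatement (the cap of 30.0 is Gemma 2's published final_logit_softcapping value, shown here only as an example):

    import torch

    def softcap_logits(logits: torch.Tensor, cap: float = 30.0) -> torch.Tensor:
        # equivalent to the in-place sequence above: divide, tanh, multiply back
        return cap * torch.tanh(logits / cap)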
sglang/srt/layers/pooler.py ADDED
@@ -0,0 +1,50 @@
+ # adapted from
+ # https://github.com/vllm-project/vllm/blob/82a1b1a82b1fbb454c82a9ef95730b929c9b270c/vllm/model_executor/layers/pooler.py
+
+ from dataclasses import dataclass
+ from enum import IntEnum
+
+ import torch
+ import torch.nn as nn
+
+ from sglang.srt.model_executor.model_runner import InputMetadata
+
+
+ class PoolingType(IntEnum):
+     LAST = 0
+
+
+ @dataclass
+ class EmbeddingPoolerOutput:
+     embeddings: torch.Tensor
+
+
+ class Pooler(nn.Module):
+     """A layer that pools specific information from hidden states.
+     This layer does the following:
+     1. Extracts specific tokens or aggregates data based on pooling method.
+     2. Normalizes output if specified.
+     3. Returns structured results as `PoolerOutput`.
+     Attributes:
+         pooling_type: The type of pooling to use (LAST, AVERAGE, MAX).
+         normalize: Whether to normalize the pooled data.
+     """
+
+     def __init__(self, pooling_type: PoolingType, normalize: bool):
+         super().__init__()
+         self.pooling_type = pooling_type
+         self.normalize = normalize
+
+     def forward(
+         self, hidden_states: torch.Tensor, input_metadata: InputMetadata
+     ) -> EmbeddingPoolerOutput:
+         if self.pooling_type == PoolingType.LAST:
+             last_token_indices = torch.cumsum(input_metadata.extend_seq_lens, dim=0) - 1
+             pooled_data = hidden_states[last_token_indices]
+         else:
+             raise ValueError(f"Invalid pooling type: {self.pooling_type}")
+
+         if self.normalize:
+             pooled_data = nn.functional.normalize(pooled_data, p=2, dim=1)
+
+         return EmbeddingPoolerOutput(embeddings=pooled_data)
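The LAST pooling branch relies on a cumulative-sum trick: with requests packed back to back and extend_seq_lens holding each request's token count, cumsum(lens) - 1 gives the flat index of each request's final hidden state. A tiny illustration with made-up lengths and hidden size:

    import torch

    extend_seq_lens = torch.tensor([3, 5, 2])        # tokens per request, packed contiguously
    last = torch.cumsum(extend_seq_lens, dim=0) - 1  # tensor([2, 7, 9])
    hidden = torch.randn(10, 4096)                   # 3 + 5 + 2 = 10 packed token states
    pooled = hidden[last]                            # one embedding per request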
sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} RENAMED
@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """

+ """
+ Memory-efficient attention for prefill.
+ It supports page size = 1.
+ """
+
  # Adapted from
  # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L1
  import torch
sglang/srt/layers/radix_attention.py CHANGED
@@ -20,8 +20,8 @@ from flashinfer.cascade import merge_state
  from torch import nn

  from sglang.global_config import global_config
+ from sglang.srt.layers.decode_attention import decode_attention_fwd
  from sglang.srt.layers.extend_attention import extend_attention_fwd
- from sglang.srt.layers.token_attention import token_attention_fwd
  from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
  from sglang.srt.model_executor.model_runner import global_server_args_dict

@@ -95,7 +95,7 @@ class RadixAttention(nn.Module):
  o = torch.empty_like(q)
  self.store_kv_cache(k, v, input_metadata)

- token_attention_fwd(
+ decode_attention_fwd(
  q.view(-1, self.tp_q_head_num, self.qk_head_dim),
  input_metadata.token_to_kv_pool.get_key_buffer(self.layer_id),
  input_metadata.token_to_kv_pool.get_value_buffer(self.layer_id),
sglang/srt/managers/detokenizer_manager.py CHANGED
@@ -25,10 +25,14 @@ import zmq
  import zmq.asyncio

  from sglang.srt.hf_transformers_utils import get_tokenizer
- from sglang.srt.managers.io_struct import BatchStrOut, BatchTokenIDOut
+ from sglang.srt.managers.io_struct import (
+ BatchEmbeddingOut,
+ BatchStrOut,
+ BatchTokenIDOut,
+ )
  from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR
  from sglang.srt.server_args import PortArgs, ServerArgs
- from sglang.utils import find_printable_text, get_exception_traceback, graceful_registry
+ from sglang.utils import find_printable_text, get_exception_traceback

  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

@@ -55,20 +59,40 @@ class DetokenizerManager:
  self.send_to_tokenizer = context.socket(zmq.PUSH)
  self.send_to_tokenizer.connect(f"tcp://127.0.0.1:{port_args.tokenizer_port}")

- self.tokenizer = get_tokenizer(
- server_args.tokenizer_path,
- tokenizer_mode=server_args.tokenizer_mode,
- trust_remote_code=server_args.trust_remote_code,
- )
+ if server_args.skip_tokenizer_init:
+ self.tokenizer = None
+ else:
+ self.tokenizer = get_tokenizer(
+ server_args.tokenizer_path,
+ tokenizer_mode=server_args.tokenizer_mode,
+ trust_remote_code=server_args.trust_remote_code,
+ )

  self.decode_status = {}

  async def handle_loop(self):
  while True:
  recv_obj: BatchTokenIDOut = await self.recv_from_router.recv_pyobj()
+
+ if isinstance(recv_obj, BatchEmbeddingOut):
+ self.send_to_tokenizer.send_pyobj(
+ BatchEmbeddingOut(
+ rids=recv_obj.rids,
+ embeddings=recv_obj.embeddings,
+ meta_info=recv_obj.meta_info,
+ finished_reason=recv_obj.finished_reason,
+ )
+ )
+ continue
+
  assert isinstance(recv_obj, BatchTokenIDOut)
  bs = len(recv_obj.rids)

+ if self.tokenizer is None:
+ # Send BatchTokenIDOut if no tokenizer init'ed.
+ self.send_to_tokenizer.send_pyobj(recv_obj)
+ continue
+
  # Initialize decode status
  read_ids, surr_ids = [], []
  for i in range(bs):
@@ -140,8 +164,6 @@ def start_detokenizer_process(
  port_args: PortArgs,
  pipe_writer,
  ):
- graceful_registry(inspect.currentframe().f_code.co_name)
-
  try:
  manager = DetokenizerManager(server_args, port_args)
  except Exception: