sglang 0.2.14__py3-none-any.whl → 0.2.14.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/constrained/fsm_cache.py +11 -2
- sglang/srt/constrained/jump_forward.py +1 -0
- sglang/srt/layers/activation.py +83 -7
- sglang/srt/layers/layernorm.py +0 -3
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/sampler.py +15 -68
- sglang/srt/managers/schedule_batch.py +15 -20
- sglang/srt/managers/tp_worker.py +40 -33
- sglang/srt/model_executor/cuda_graph_runner.py +17 -31
- sglang/srt/model_executor/forward_batch_info.py +1 -8
- sglang/srt/model_executor/model_runner.py +5 -11
- sglang/srt/models/chatglm.py +12 -4
- sglang/srt/models/commandr.py +1 -5
- sglang/srt/models/dbrx.py +1 -5
- sglang/srt/models/deepseek.py +1 -5
- sglang/srt/models/deepseek_v2.py +1 -5
- sglang/srt/models/gemma.py +1 -5
- sglang/srt/models/gemma2.py +1 -5
- sglang/srt/models/gpt_bigcode.py +2 -6
- sglang/srt/models/grok.py +1 -5
- sglang/srt/models/internlm2.py +1 -5
- sglang/srt/models/llama2.py +3 -7
- sglang/srt/models/llama_classification.py +2 -2
- sglang/srt/models/minicpm.py +1 -5
- sglang/srt/models/mixtral.py +1 -5
- sglang/srt/models/mixtral_quant.py +1 -5
- sglang/srt/models/qwen.py +2 -5
- sglang/srt/models/qwen2.py +2 -6
- sglang/srt/models/qwen2_moe.py +14 -5
- sglang/srt/models/stablelm.py +1 -5
- sglang/srt/openai_api/adapter.py +85 -4
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/sampling_batch_info.py +1 -74
- sglang/srt/sampling/sampling_params.py +4 -0
- sglang/srt/server.py +8 -1
- sglang/test/runners.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/METADATA +10 -4
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/RECORD +42 -42
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/WHEEL +1 -1
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/LICENSE +0 -0
- {sglang-0.2.14.dist-info → sglang-0.2.14.post1.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -17,6 +17,7 @@ limitations under the License.
 
 import bisect
 from contextlib import contextmanager
+from typing import Callable, List
 
 import torch
 from flashinfer import BatchDecodeWithPagedKVCacheWrapper
@@ -25,18 +26,16 @@ from vllm.distributed.parallel_state import graph_capture
 from vllm.model_executor.custom_op import CustomOp
 
 from sglang.srt.layers.logits_processor import (
+    LogitProcessorOutput,
     LogitsMetadata,
     LogitsProcessor,
-    LogitsProcessorOutput,
 )
-from sglang.srt.layers.sampler import SampleOutput
 from sglang.srt.managers.schedule_batch import ScheduleBatch
 from sglang.srt.model_executor.forward_batch_info import (
     ForwardMode,
     InputMetadata,
     update_flashinfer_indices,
 )
-from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.utils import monkey_patch_vllm_all_gather
 
 
@@ -53,12 +52,12 @@ def _to_torch(model: torch.nn.Module, reverse: bool = False):
 
 @contextmanager
 def patch_model(
-    model: torch.nn.Module,
+    model: torch.nn.Module, enable_compile: bool, tp_group: "GroupCoordinator"
 ):
     backup_ca_comm = None
 
     try:
-        if
+        if enable_compile:
             _to_torch(model)
             monkey_patch_vllm_all_gather()
             backup_ca_comm = tp_group.ca_comm
@@ -67,7 +66,7 @@ def patch_model(
         else:
             yield model.forward
     finally:
-        if
+        if enable_compile:
             _to_torch(model, reverse=True)
             monkey_patch_vllm_all_gather(reverse=True)
             tp_group.ca_comm = backup_ca_comm
@@ -88,7 +87,7 @@ def set_torch_compile_config():
 class CudaGraphRunner:
     def __init__(
         self,
-        model_runner,
+        model_runner: "ModelRunner",
         max_batch_size_to_capture: int,
         use_torch_compile: bool,
         disable_padding: bool,
@@ -145,22 +144,18 @@ class CudaGraphRunner:
             self.flashinfer_kv_indices.clone(),
         ]
 
-        # Sampling inputs
-        vocab_size = model_runner.model_config.vocab_size
-        self.sampling_info = SamplingBatchInfo.dummy_one(self.max_bs, vocab_size)
-
         self.compile_bs = [1, 2, 4, 8, 16, 24, 32] if use_torch_compile else []
 
         if use_torch_compile:
             set_torch_compile_config()
 
-    def can_run(self, batch_size):
+    def can_run(self, batch_size: int):
         if self.disable_padding:
             return batch_size in self.graphs
         else:
             return batch_size <= self.max_bs
 
-    def capture(self, batch_size_list):
+    def capture(self, batch_size_list: List[int]):
         self.batch_size_list = batch_size_list
         with graph_capture() as graph_capture_context:
             self.stream = graph_capture_context.stream
@@ -181,7 +176,7 @@ class CudaGraphRunner:
             self.output_buffers[bs] = output_buffers
             self.flashinfer_handlers[bs] = flashinfer_handler
 
-    def capture_one_batch_size(self, bs, forward):
+    def capture_one_batch_size(self, bs: int, forward: Callable):
        graph = torch.cuda.CUDAGraph()
        stream = self.stream
 
@@ -240,7 +235,6 @@ class CudaGraphRunner:
         def run_once():
             input_metadata = InputMetadata(
                 forward_mode=ForwardMode.DECODE,
-                sampling_info=self.sampling_info[:bs],
                 batch_size=bs,
                 req_pool_indices=req_pool_indices,
                 seq_lens=seq_lens,
@@ -305,35 +299,27 @@ class CudaGraphRunner:
             self.flashinfer_handlers[bs],
         )
 
-        # Sampling inputs
-        self.sampling_info.inplace_assign(raw_bs, batch.sampling_info)
-
         # Replay
         torch.cuda.synchronize()
         self.graphs[bs].replay()
         torch.cuda.synchronize()
-        sample_output, logits_output = self.output_buffers[bs]
+        output = self.output_buffers[bs]
 
         # Unpad
         if bs != raw_bs:
-            logits_output = LogitsProcessorOutput(
-                next_token_logits=logits_output.next_token_logits[:raw_bs],
+            output = LogitProcessorOutput(
+                next_token_logits=output.next_token_logits[:raw_bs],
                 next_token_logprobs=None,
                 normalized_prompt_logprobs=None,
                 input_token_logprobs=None,
                 input_top_logprobs=None,
                 output_top_logprobs=None,
             )
-            sample_output = SampleOutput(
-                sample_output.success[:raw_bs],
-                sample_output.probs[:raw_bs],
-                sample_output.batch_next_token_ids[:raw_bs],
-            )
 
         # Extract logprobs
         if batch.return_logprob:
-            logits_output.next_token_logprobs = torch.nn.functional.log_softmax(
-                logits_output.next_token_logits, dim=-1
+            output.next_token_logprobs = torch.nn.functional.log_softmax(
+                output.next_token_logits, dim=-1
             )
             return_top_logprob = any(x > 0 for x in batch.top_logprobs_nums)
             if return_top_logprob:
@@ -341,8 +327,8 @@ class CudaGraphRunner:
                     forward_mode=ForwardMode.DECODE,
                     top_logprobs_nums=batch.top_logprobs_nums,
                 )
-                logits_output.output_top_logprobs = LogitsProcessor.get_top_logprobs(
-                    logits_output.next_token_logprobs, logits_metadata
+                output.output_top_logprobs = LogitsProcessor.get_top_logprobs(
+                    output.next_token_logprobs, logits_metadata
                 )[1]
 
-        return sample_output, logits_output
+        return output
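Note: with sampling removed from the captured graph, the replay path above recomputes next_token_logprobs outside the graph. A minimal, self-contained sketch of that extraction step (the batch size, vocab size, and top-k of 5 are illustrative assumptions, not values from the diff):

import torch

# Stand-in for the logits returned from the replayed CUDA graph: [batch, vocab].
next_token_logits = torch.randn(4, 32000)

# Log-probabilities over the vocabulary, as done under `if batch.return_logprob`.
next_token_logprobs = torch.nn.functional.log_softmax(next_token_logits, dim=-1)

# Per-request top-k, analogous to what LogitsProcessor.get_top_logprobs collects.
top_values, top_ids = next_token_logprobs.topk(5, dim=-1)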
sglang/srt/model_executor/forward_batch_info.py
CHANGED
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 """
 Copyright 2023-2024 SGLang Team
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,7 +16,7 @@ limitations under the License.
 """ModelRunner runs the forward passes of the models."""
 from dataclasses import dataclass
 from enum import IntEnum, auto
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List, Optional
 
 import numpy as np
 import torch
@@ -28,7 +26,6 @@ from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
 
 if TYPE_CHECKING:
     from sglang.srt.model_executor.model_runner import ModelRunner
-    from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 
 
 class ForwardMode(IntEnum):
@@ -45,7 +42,6 @@ class InputMetadata:
     """Store all inforamtion of a forward pass."""
 
     forward_mode: ForwardMode
-    sampling_info: SamplingBatchInfo
     batch_size: int
     req_pool_indices: torch.Tensor
     seq_lens: torch.Tensor
@@ -183,7 +179,6 @@ class InputMetadata:
     ):
         ret = cls(
             forward_mode=forward_mode,
-            sampling_info=batch.sampling_info,
             batch_size=batch.batch_size(),
             req_pool_indices=batch.req_pool_indices,
             seq_lens=batch.seq_lens,
@@ -194,8 +189,6 @@ class InputMetadata:
             top_logprobs_nums=batch.top_logprobs_nums,
         )
 
-        ret.sampling_info.prepare_penalties()
-
        ret.compute_positions(batch)
 
        ret.compute_extend_infos(batch)
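With sampling_info gone, InputMetadata carries only forward-pass state. A trimmed sketch of the resulting shape (field names are taken from the hunks above; the real class declares many more fields, and only the ForwardMode members referenced in this diff are shown):

from dataclasses import dataclass
from enum import IntEnum, auto

import torch

class ForwardMode(IntEnum):
    # The diff references EXTEND and DECODE; other members are omitted here.
    EXTEND = auto()
    DECODE = auto()

@dataclass
class InputMetadata:
    forward_mode: ForwardMode
    batch_size: int
    req_pool_indices: torch.Tensor
    seq_lens: torch.Tensor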
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -21,7 +21,7 @@ import importlib.resources
 import logging
 import pkgutil
 from functools import lru_cache
-from typing import Optional, Tuple, Type
+from typing import Optional, Type
 
 import torch
 import torch.nn as nn
@@ -44,8 +44,6 @@ from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models import ModelRegistry
 
 from sglang.global_config import global_config
-from sglang.srt.layers.logits_processor import LogitsProcessorOutput
-from sglang.srt.layers.sampler import SampleOutput
 from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict
 from sglang.srt.mem_cache.memory_pool import (
     MHATokenToKVPool,
@@ -161,6 +159,8 @@ class ModelRunner:
                 "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
             )
             self.server_args.dtype = "float16"
+            if torch.cuda.get_device_capability()[1] < 5:
+                raise RuntimeError("SGLang only supports sm75 and above.")
 
         monkey_patch_vllm_dummy_weight_loader()
         self.device_config = DeviceConfig()
@@ -515,11 +515,7 @@ class ModelRunner:
 
     @torch.inference_mode()
     def forward_decode(self, batch: ScheduleBatch):
-        if (
-            self.cuda_graph_runner
-            and self.cuda_graph_runner.can_run(len(batch.reqs))
-            and not batch.sampling_info.has_bias()
-        ):
+        if self.cuda_graph_runner and self.cuda_graph_runner.can_run(len(batch.reqs)):
             return self.cuda_graph_runner.replay(batch)
 
         input_metadata = InputMetadata.from_schedule_batch(
@@ -568,9 +564,7 @@ class ModelRunner:
             input_metadata.image_offsets,
         )
 
-    def forward(
-        self, batch: ScheduleBatch, forward_mode: ForwardMode
-    ) -> Tuple[SampleOutput, LogitsProcessorOutput]:
+    def forward(self, batch: ScheduleBatch, forward_mode: ForwardMode):
         if self.is_multimodal_model and forward_mode == ForwardMode.EXTEND:
             return self.forward_extend_multi_modal(batch)
         elif forward_mode == ForwardMode.DECODE:
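The two lines added to ModelRunner tighten the hardware floor: a minor capability digit under 5 now raises instead of running on unsupported GPUs. A standalone sketch of the combined check (assuming the new guard nests inside the existing below-sm80 branch, which the hunk context suggests but the flattened indentation cannot prove):

import torch

major, minor = torch.cuda.get_device_capability()
if major < 8:
    dtype = "float16"  # bfloat16 needs sm80 or newer
    if minor < 5:      # within major < 8 this rejects e.g. sm70 and sm61
        raise RuntimeError("SGLang only supports sm75 and above.")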
sglang/srt/models/chatglm.py
CHANGED
@@ -31,18 +31,20 @@ from vllm.model_executor.layers.linear import (
 )
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import SamplerOutput
 from vllm.transformers_utils.configs import ChatGLMConfig
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 LoraConfig = None
@@ -381,11 +383,17 @@ class ChatGLMForCausalLM(nn.Module):
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
         hidden_states = self.transformer(input_ids, positions, input_metadata)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
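ChatGLM now follows vLLM's two-phase convention: forward() ends at the logits processor, and a separate sample() drives vLLM's Sampler. A minimal, self-contained illustration of that split (a toy model, not the ChatGLM code; all names here are hypothetical):

import torch

class TinyLM(torch.nn.Module):
    def __init__(self, vocab_size: int = 100, hidden: int = 16):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, hidden)
        self.lm_head = torch.nn.Linear(hidden, vocab_size, bias=False)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        # Phase 1: hidden states -> next-token logits (the LogitsProcessor stage).
        return self.lm_head(self.embed(input_ids)).mean(dim=1)

    def sample(self, logits: torch.Tensor) -> torch.Tensor:
        # Phase 2: logits -> token ids, kept separate so serving code can swap samplers.
        return torch.multinomial(torch.softmax(logits, dim=-1), num_samples=1)

lm = TinyLM()
logits = lm(torch.randint(0, 100, (2, 8)))
print(lm.sample(logits))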
sglang/srt/models/commandr.py
CHANGED
@@ -64,7 +64,6 @@ from vllm.model_executor.utils import set_weight_attrs
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
@@ -327,7 +326,6 @@ class CohereForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()
         self.model = CohereModel(config, quant_config)
 
     @torch.no_grad()
@@ -342,11 +340,9 @@ class CohereForCausalLM(nn.Module):
             positions,
             input_metadata,
         )
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/models/dbrx.py
CHANGED
@@ -45,7 +45,6 @@ from vllm.transformers_utils.configs.dbrx import DbrxConfig
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
@@ -383,7 +382,6 @@ class DbrxForCausalLM(nn.Module):
             padding_size=DEFAULT_VOCAB_PADDING_SIZE,
         )
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()
 
     @torch.no_grad()
     def forward(
@@ -393,11 +391,9 @@ class DbrxForCausalLM(nn.Module):
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
         hidden_states = self.transformer(input_ids, positions, input_metadata)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         expert_params_mapping = [
sglang/srt/models/deepseek.py
CHANGED
@@ -46,7 +46,6 @@ from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
@@ -386,7 +385,6 @@ class DeepseekForCausalLM(nn.Module):
             config.vocab_size, config.hidden_size, quant_config=quant_config
         )
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()
 
     @torch.no_grad()
     def forward(
@@ -396,11 +394,9 @@ class DeepseekForCausalLM(nn.Module):
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -45,7 +45,6 @@ from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
@@ -633,7 +632,6 @@ class DeepseekV2ForCausalLM(nn.Module):
             config.vocab_size, config.hidden_size, quant_config=quant_config
         )
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()
 
     def forward(
         self,
@@ -642,11 +640,9 @@ class DeepseekV2ForCausalLM(nn.Module):
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/models/gemma.py
CHANGED
@@ -37,7 +37,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
@@ -288,7 +287,6 @@ class GemmaForCausalLM(nn.Module):
         self.quant_config = quant_config
         self.model = GemmaModel(config, quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()
 
     @torch.no_grad()
     def forward(
@@ -299,11 +297,9 @@ class GemmaForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return (sample_output, logits_output)
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/models/gemma2.py
CHANGED
@@ -41,7 +41,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.activation import GeluAndMul
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
@@ -397,7 +396,6 @@ class Gemma2ForCausalLM(nn.Module):
         self.quant_config = quant_config
         self.model = Gemma2Model(config, cache_config, quant_config)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()
 
     @torch.no_grad()
     def forward(
@@ -408,11 +406,9 @@ class Gemma2ForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
 
     def get_attention_sliding_window_size(self):
         return get_attention_sliding_window_size(self.config)
sglang/srt/models/gpt_bigcode.py
CHANGED
@@ -23,7 +23,6 @@ from torch import nn
 from transformers import GPTBigCodeConfig
 from vllm.config import CacheConfig, LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
@@ -33,9 +32,9 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from sglang.srt.layers.activation import get_act_fn
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
@@ -262,7 +261,6 @@ class GPTBigCodeForCausalLM(nn.Module):
         if lora_config:
             self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()
 
     @torch.no_grad()
     def forward(
@@ -272,11 +270,9 @@ class GPTBigCodeForCausalLM(nn.Module):
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
         hidden_states = self.transformer(input_ids, positions, input_metadata)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
sglang/srt/models/grok.py
CHANGED
@@ -46,7 +46,6 @@ from sglang.srt.layers.fused_moe import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
@@ -298,7 +297,6 @@ class Grok1ModelForCausalLM(nn.Module):
         self.model = Grok1Model(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()
 
         # Monkey patch _prepare_weights to load pre-sharded weights
         setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights)
@@ -315,11 +313,9 @@ class Grok1ModelForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/models/internlm2.py
CHANGED
@@ -40,7 +40,6 @@ from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
@@ -263,7 +262,6 @@ class InternLM2ForCausalLM(nn.Module):
         self.model = InternLM2Model(config, quant_config)
         self.output = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()
 
     @torch.no_grad()
     def forward(
@@ -274,11 +272,9 @@ class InternLM2ForCausalLM(nn.Module):
         input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.output.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
sglang/srt/models/llama2.py
CHANGED
@@ -39,9 +39,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
-from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.logits_processor import LogitProcessorOutput, LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
@@ -303,7 +302,6 @@ class LlamaForCausalLM(nn.Module):
         self.model = LlamaModel(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()
 
     @torch.no_grad()
     def forward(
@@ -312,13 +310,11 @@ class LlamaForCausalLM(nn.Module):
         positions: torch.Tensor,
         input_metadata: InputMetadata,
         input_embeds: torch.Tensor = None,
-    ) -> Tuple[SampleOutput, LogitsProcessorOutput]:
+    ) -> LogitProcessorOutput:
         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, self.lm_head.weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
 
     def get_module_name(self, name):
         stacked_params_mapping = [
sglang/srt/models/llama_classification.py
CHANGED
@@ -24,7 +24,7 @@ from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.logits_processor import LogitProcessorOutput
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 from sglang.srt.models.llama2 import LlamaModel
 
@@ -65,7 +65,7 @@ class LlamaForClassification(nn.Module):
             (input_metadata.batch_size, self.config.classification_out_size)
         ).to(input_ids.device)
 
-        return LogitsProcessorOutput(
+        return LogitProcessorOutput(
             next_token_logits=scores,
             next_token_logprobs=scores,
             normalized_prompt_logprobs=scores,
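Both Llama files now return LogitProcessorOutput directly. A sketch of that container as the hunks describe it (field names come from the diffs above; the exact type annotations are assumptions):

from dataclasses import dataclass
from typing import List, Optional

import torch

@dataclass
class LogitProcessorOutput:
    next_token_logits: torch.Tensor
    next_token_logprobs: Optional[torch.Tensor]
    normalized_prompt_logprobs: Optional[torch.Tensor]
    input_token_logprobs: Optional[torch.Tensor]
    input_top_logprobs: Optional[List]
    output_top_logprobs: Optional[List]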
sglang/srt/models/minicpm.py
CHANGED
@@ -39,7 +39,6 @@ from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.layers.sampler import Sampler
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
@@ -298,7 +297,6 @@ class MiniCPMForCausalLM(nn.Module):
         self.scale_width = self.config.hidden_size / self.config.dim_model_base
 
         self.logits_processor = LogitsProcessor(config)
-        self.sampler = Sampler()
 
     @torch.no_grad()
     def forward(
@@ -316,11 +314,9 @@ class MiniCPMForCausalLM(nn.Module):
             lm_head_weight = self.model.embed_tokens.weight
         else:
             lm_head_weight = self.lm_head.weight
-        logits_output = self.logits_processor(
+        return self.logits_processor(
             input_ids, hidden_states, lm_head_weight, input_metadata
         )
-        sample_output = self.sampler(logits_output, input_metadata.sampling_info)
-        return sample_output, logits_output
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
|