PyPI - sglang - Versions diffs - 0.2.14__py3-none-any.whl → 0.2.14.post2__py3-none-any.whl - Mend

sglang 0.2.14__py3-none-any.whl → 0.2.14.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

sglang/launch_server_llavavid.py +26 -0
sglang/srt/constrained/fsm_cache.py +11 -2
sglang/srt/constrained/jump_forward.py +1 -0
sglang/srt/hf_transformers_utils.py +0 -149
sglang/srt/layers/activation.py +93 -11
sglang/srt/layers/layernorm.py +47 -4
sglang/srt/layers/logits_processor.py +4 -4
sglang/srt/layers/sampler.py +15 -68
sglang/srt/managers/io_struct.py +5 -4
sglang/srt/managers/schedule_batch.py +20 -25
sglang/srt/managers/tokenizer_manager.py +74 -61
sglang/srt/managers/tp_worker.py +49 -43
sglang/srt/model_executor/cuda_graph_runner.py +17 -31
sglang/srt/model_executor/forward_batch_info.py +9 -26
sglang/srt/model_executor/model_runner.py +20 -17
sglang/srt/models/chatglm.py +13 -5
sglang/srt/models/commandr.py +1 -5
sglang/srt/models/dbrx.py +1 -5
sglang/srt/models/deepseek.py +1 -5
sglang/srt/models/deepseek_v2.py +1 -5
sglang/srt/models/gemma.py +3 -7
sglang/srt/models/gemma2.py +2 -56
sglang/srt/models/gpt_bigcode.py +2 -6
sglang/srt/models/grok.py +10 -8
sglang/srt/models/internlm2.py +1 -5
sglang/srt/models/llama2.py +6 -11
sglang/srt/models/llama_classification.py +2 -6
sglang/srt/models/llama_embedding.py +3 -4
sglang/srt/models/llava.py +69 -91
sglang/srt/models/llavavid.py +40 -86
sglang/srt/models/minicpm.py +1 -5
sglang/srt/models/mixtral.py +1 -5
sglang/srt/models/mixtral_quant.py +1 -5
sglang/srt/models/qwen.py +2 -5
sglang/srt/models/qwen2.py +5 -10
sglang/srt/models/qwen2_moe.py +21 -24
sglang/srt/models/stablelm.py +1 -5
sglang/srt/models/yivl.py +2 -7
sglang/srt/openai_api/adapter.py +85 -4
sglang/srt/openai_api/protocol.py +2 -0
sglang/srt/sampling/sampling_batch_info.py +1 -74
sglang/srt/sampling/sampling_params.py +4 -0
sglang/srt/server.py +11 -4
sglang/srt/utils.py +18 -33
sglang/test/runners.py +2 -2
sglang/test/test_layernorm.py +53 -1
sglang/version.py +1 -1
{sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/METADATA +11 -5
{sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/RECORD +52 -51
{sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/WHEEL +1 -1
{sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/LICENSE +0 -0
{sglang-0.2.14.dist-info → sglang-0.2.14.post2.dist-info}/top_level.txt +0 -0

sglang/launch_server_llavavid.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""Launch the inference server for Llava-video model."""
+import argparse
+from sglang.srt.server import ServerArgs, launch_server
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+    model_overide_args = {}
+    model_overide_args["mm_spatial_pool_stride"] = 2
+    model_overide_args["architectures"] = ["LlavaVidForCausalLM"]
+    model_overide_args["num_frames"] = 16
+    model_overide_args["model_type"] = "llavavid"
+    if model_overide_args["num_frames"] == 32:
+        model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+        model_overide_args["max_sequence_length"] = 4096 * 2
+        model_overide_args["tokenizer_model_max_length"] = 4096 * 2
+        model_overide_args["model_max_length"] = 4096 * 2
+    if "34b" in args.model_path.lower():
+        model_overide_args["image_token_index"] = 64002
+    launch_server(server_args, model_overide_args, None)

sglang/srt/constrained/fsm_cache.py CHANGED Viewed

@@ -15,6 +15,8 @@ limitations under the License.
 """Cache for the compressed finite state machine."""
+from outlines.fsm.json_schema import build_regex_from_schema
 from sglang.srt.constrained import RegexGuide, TransformerTokenizer
 from sglang.srt.constrained.base_tool_cache import BaseToolCache
@@ -26,9 +28,12 @@ class FSMCache(BaseToolCache):
         tokenizer_args_dict,
         enable=True,
         skip_tokenizer_init=False,
+        json_schema_mode=False,
     ):
         super().__init__(enable=enable)
+        self.json_schema_mode = json_schema_mode
         if (
             skip_tokenizer_init
             or tokenizer_path.endswith(".json")
@@ -72,5 +77,9 @@ class FSMCache(BaseToolCache):
                 tokenizer_path, **tokenizer_args_dict
             )
-    def init_value(self, regex):
-        return RegexGuide(regex, self.outlines_tokenizer)
+    def init_value(self, value):
+        if self.json_schema_mode:
+            regex = build_regex_from_schema(value)
+            return RegexGuide(regex, self.outlines_tokenizer), regex
+        else:
+            return RegexGuide(value, self.outlines_tokenizer)

sglang/srt/constrained/jump_forward.py CHANGED Viewed

@@ -23,6 +23,7 @@ from collections import defaultdict
 import interegular
 import outlines.caching
+from outlines.fsm.json_schema import build_regex_from_schema
 from sglang.srt.constrained import (
     FSMInfo,

sglang/srt/hf_transformers_utils.py CHANGED Viewed

@@ -119,24 +119,7 @@ def get_tokenizer(
     tokenizer_revision: Optional[str] = None,
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
-    if tokenizer_name.endswith(".json"):
-        return TiktokenTokenizer(tokenizer_name)
-    if tokenizer_name.endswith(".model"):
-        return SentencePieceTokenizer(tokenizer_name)
     """Gets a tokenizer for the given model name via Huggingface."""
-    if is_multimodal_model(tokenizer_name):
-        processor = get_processor(
-            tokenizer_name,
-            *args,
-            trust_remote_code=trust_remote_code,
-            tokenizer_revision=tokenizer_revision,
-            **kwargs,
-        )
-        tokenizer = processor.tokenizer
-        return tokenizer
     if tokenizer_mode == "slow":
         if kwargs.get("use_fast", False):
             raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
@@ -199,135 +182,3 @@ def get_processor(
         **kwargs,
     )
     return processor
-class TiktokenTokenizer:
-    def __init__(self, tokenizer_path):
-        import tiktoken
-        from jinja2 import Template
-        PAT_STR_B = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
-        # Read JSON
-        name = "tmp-json"
-        with open(tokenizer_path, "rb") as fin:
-            tok_dict = json.load(fin)
-        mergeable_ranks = {
-            bytes(item["bytes"]): item["token"] for item in tok_dict["regular_tokens"]
-        }
-        special_tokens = {
-            bytes(item["bytes"]).decode(): item["token"]
-            for item in tok_dict["special_tokens"]
-        }
-        assert tok_dict["word_split"] == "V1"
-        default_allowed_special = None
-        kwargs = {
-            "name": name,
-            "pat_str": tok_dict.get("pat_str", PAT_STR_B),
-            "mergeable_ranks": mergeable_ranks,
-            "special_tokens": special_tokens,
-        }
-        if "default_allowed_special" in tok_dict:
-            default_allowed_special = set(
-                [
-                    bytes(bytes_list).decode()
-                    for bytes_list in tok_dict["default_allowed_special"]
-                ]
-            )
-        if "vocab_size" in tok_dict:
-            kwargs["explicit_n_vocab"] = tok_dict["vocab_size"]
-        PAD = "<|pad|>"
-        EOS = "<|eos|>"
-        SEP = "<|separator|>"
-        DEFAULT_CONTROL_TOKENS = {"pad": PAD, "sep": EOS, "eos": SEP}
-        tokenizer = tiktoken.Encoding(**kwargs)
-        tokenizer._default_allowed_special = default_allowed_special or set()
-        tokenizer._control_tokens = DEFAULT_CONTROL_TOKENS
-        def encode_patched(
-            self,
-            text: str,
-            *,
-            allowed_special: Union[
-                Literal["all"], AbstractSet[str]
-            ] = set(),  # noqa: B006
-            disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-        ) -> List[int]:
-            if isinstance(allowed_special, set):
-                allowed_special |= self._default_allowed_special
-            return tiktoken.Encoding.encode(
-                self,
-                text,
-                allowed_special=allowed_special,
-                disallowed_special=(),
-            )
-        tokenizer.encode = functools.partial(encode_patched, tokenizer)
-        # Convert to HF interface
-        self.tokenizer = tokenizer
-        self.eos_token_id = tokenizer._special_tokens[EOS]
-        self.vocab_size = tokenizer.n_vocab
-        self.chat_template = Template(
-            "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: '  + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
-        )
-    def encode(self, x, add_special_tokens=False):
-        return self.tokenizer.encode(x)
-    def decode(self, x):
-        return self.tokenizer.decode(x)
-    def batch_decode(
-        self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
-    ):
-        if isinstance(batch[0], int):
-            batch = [[x] for x in batch]
-        return self.tokenizer.decode_batch(batch)
-    def apply_chat_template(self, messages, tokenize, add_generation_prompt):
-        ret = self.chat_template.render(
-            messages=messages, add_generation_prompt=add_generation_prompt
-        )
-        return self.encode(ret) if tokenize else ret
-class SentencePieceTokenizer:
-    def __init__(self, tokenizer_path):
-        import sentencepiece as spm
-        from jinja2 import Template
-        tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path)
-        # Convert to HF interface
-        self.tokenizer = tokenizer
-        self.eos_token_id = tokenizer.eos_id()
-        self.vocab_size = tokenizer.vocab_size()
-        self.chat_template = Template(
-            "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: '  + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
-        )
-    def encode(self, x, add_special_tokens=False):
-        return self.tokenizer.encode(x)
-    def decode(self, x):
-        return self.tokenizer.decode(x)
-    def batch_decode(
-        self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
-    ):
-        if isinstance(batch[0], int):
-            batch = [[x] for x in batch]
-        return self.tokenizer.decode(batch)
-    def apply_chat_template(self, messages, tokenize, add_generation_prompt):
-        ret = self.chat_template.render(
-            messages=messages, add_generation_prompt=add_generation_prompt
-        )
-        return self.encode(ret) if tokenize else ret

sglang/srt/layers/activation.py CHANGED Viewed

@@ -13,25 +13,28 @@ limitations under the License.
 """Fused operators for activation layers."""
+from typing import Optional
 import torch
+import torch.nn as nn
 import torch.nn.functional as F
-from flashinfer.activation import gelu_tanh_and_mul, silu_and_mul
+from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+from vllm.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.utils import set_weight_attrs
 class SiluAndMul(CustomOp):
-    def __init__(self, **kwargs):
-        super().__init__()
-        self.is_lower_sm80 = torch.cuda.get_device_capability()[0] < 8
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
         return F.silu(x[..., :d]) * x[..., d:]
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        if self.is_lower_sm80:
-            return self.forward_native(x)
         d = x.shape[-1] // 2
         output_shape = x.shape[:-1] + (d,)
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
@@ -40,16 +43,95 @@ class SiluAndMul(CustomOp):
 class GeluAndMul(CustomOp):
-    def __init__(self, **kwargs):
+    def __init__(self, approximate="tanh"):
         super().__init__()
+        self.approximate = approximate
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
-        return F.gelu(x[..., :d], approximate="tanh") * x[..., d:]
+        return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
         output_shape = x.shape[:-1] + (d,)
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        gelu_tanh_and_mul(x, out)
+        if self.approximate == "tanh":
+            gelu_tanh_and_mul(x, out)
+        elif self.approximate == "none":
+            gelu_and_mul(x, out)
+        else:
+            raise RuntimeError("GeluAndMul only support tanh or none")
         return out
+class ScaledActivation(nn.Module):
+    """An activation function with post-scale parameters.
+    This is used for some quantization methods like AWQ.
+    """
+    def __init__(
+        self,
+        act_module: nn.Module,
+        intermediate_size: int,
+        input_is_parallel: bool = True,
+        params_dtype: Optional[torch.dtype] = None,
+    ):
+        super().__init__()
+        self.act = act_module
+        self.input_is_parallel = input_is_parallel
+        if input_is_parallel:
+            tp_size = get_tensor_model_parallel_world_size()
+            intermediate_size_per_partition = divide(intermediate_size, tp_size)
+        else:
+            intermediate_size_per_partition = intermediate_size
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.scales = nn.Parameter(
+            torch.empty(intermediate_size_per_partition, dtype=params_dtype)
+        )
+        set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.act(x) / self.scales
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        param_data = param.data
+        if self.input_is_parallel:
+            tp_rank = get_tensor_model_parallel_rank()
+            shard_size = param_data.shape[0]
+            start_idx = tp_rank * shard_size
+            loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+_ACTIVATION_REGISTRY = {
+    "gelu": nn.GELU(),
+    "gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
+}
+def get_act_fn(
+    act_fn_name: str,
+    quant_config: Optional[QuantizationConfig] = None,
+    intermediate_size: Optional[int] = None,
+    input_is_parallel: bool = True,
+    params_dtype: Optional[torch.dtype] = None,
+) -> nn.Module:
+    """Get an activation function by name."""
+    act_fn_name = act_fn_name.lower()
+    if act_fn_name not in _ACTIVATION_REGISTRY:
+        raise ValueError(f"Activation function {act_fn_name!r} is not supported.")
+    act_fn = _ACTIVATION_REGISTRY[act_fn_name]
+    if quant_config is not None and act_fn_name in quant_config.get_scaled_act_names():
+        if intermediate_size is None:
+            raise ValueError(
+                "intermediate_size must be specified for scaled "
+                "activation functions."
+            )
+        return ScaledActivation(
+            act_fn, intermediate_size, input_is_parallel, params_dtype
+        )
+    return act_fn

sglang/srt/layers/layernorm.py CHANGED Viewed

@@ -19,7 +19,12 @@ from typing import Optional, Tuple, Union
 import torch
 import torch.nn as nn
-from flashinfer.norm import fused_add_rmsnorm, rmsnorm
+from flashinfer.norm import (
+    fused_add_rmsnorm,
+    gemma_fused_add_rmsnorm,
+    gemma_rmsnorm,
+    rmsnorm,
+)
 from vllm.model_executor.custom_op import CustomOp
@@ -32,15 +37,12 @@ class RMSNorm(CustomOp):
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
         self.variance_epsilon = eps
-        self.is_lower_sm80 = torch.cuda.get_device_capability()[0] < 8
     def forward_cuda(
         self,
         x: torch.Tensor,
         residual: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        if self.is_lower_sm80:
-            return self.forward_native(x, residual)
         if residual is not None:
             fused_add_rmsnorm(x, residual, self.weight.data, self.variance_epsilon)
@@ -66,3 +68,44 @@ class RMSNorm(CustomOp):
             return x
         else:
             return x, residual
+class GemmaRMSNorm(CustomOp):
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        orig_dtype = x.dtype
+        if residual is not None:
+            x = x + residual
+            residual = x
+        x = x.float()
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = x * (1.0 + self.weight.float())
+        x = x.to(orig_dtype)
+        return x if residual is None else (x, residual)
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            gemma_fused_add_rmsnorm(
+                x, residual, self.weight.data, self.variance_epsilon
+            )
+            return x, residual
+        out = gemma_rmsnorm(x, self.weight.data, self.variance_epsilon)
+        return out

sglang/srt/layers/logits_processor.py CHANGED Viewed

@@ -29,7 +29,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetad
 @dataclasses.dataclass
-class LogitsProcessorOutput:
+class LogitProcessorOutput:
     # The logits of the next tokens.       shape: [#seq, vocab_size]
     next_token_logits: torch.Tensor
     # The logprobs of the next tokens.     shape: [#seq, vocab_size]
@@ -185,7 +185,7 @@ class LogitsProcessor(nn.Module):
         # Return only last_logits if logprob is not requested
         if not logits_metadata.return_logprob:
-            return LogitsProcessorOutput(
+            return LogitProcessorOutput(
                 next_token_logits=last_logits,
                 next_token_logprobs=None,
                 normalized_prompt_logprobs=None,
@@ -209,7 +209,7 @@ class LogitsProcessor(nn.Module):
                 else:
                     output_top_logprobs = None
-                return LogitsProcessorOutput(
+                return LogitProcessorOutput(
                     next_token_logits=last_logits,
                     next_token_logprobs=last_logprobs,
                     normalized_prompt_logprobs=None,
@@ -278,7 +278,7 @@ class LogitsProcessor(nn.Module):
                 # Remove the last token logprob for the prefill tokens.
                 input_token_logprobs = input_token_logprobs[:-1]
-                return LogitsProcessorOutput(
+                return LogitProcessorOutput(
                     next_token_logits=last_logits,
                     next_token_logprobs=last_logprobs,
                     normalized_prompt_logprobs=normalized_prompt_logprobs,

sglang/srt/layers/sampler.py CHANGED Viewed

@@ -1,6 +1,4 @@
-import dataclasses
 import logging
-from typing import Union
 import torch
 from flashinfer.sampling import (
@@ -11,8 +9,6 @@ from flashinfer.sampling import (
 )
 from vllm.model_executor.custom_op import CustomOp
-from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 # TODO: move this dict to another place
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
@@ -20,71 +16,30 @@ from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 logger = logging.getLogger(__name__)
-@dataclasses.dataclass
-class SampleOutput:
-    success: torch.Tensor
-    probs: torch.Tensor
-    batch_next_token_ids: torch.Tensor
 class Sampler(CustomOp):
     def __init__(self):
         super().__init__()
-    def _apply_penalties(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo):
-        # min-token, presence, frequency
-        if sampling_info.linear_penalties is not None:
-            logits += sampling_info.linear_penalties
-        # repetition
-        if sampling_info.scaling_penalties is not None:
-            logits = torch.where(
-                logits > 0,
-                logits / sampling_info.scaling_penalties,
-                logits * sampling_info.scaling_penalties,
-            )
-        return logits
-    def _get_probs(
-        self,
-        logits: torch.Tensor,
-        sampling_info: SamplingBatchInfo,
-        is_torch_compile: bool = False,
-    ):
+    def forward_cuda(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo):
         # Post process logits
         logits = logits.contiguous()
         logits.div_(sampling_info.temperatures)
-        if is_torch_compile:
-            # FIXME: Temporary workaround for unknown bugs in torch.compile
-            logits.add_(0)
         if sampling_info.logit_bias is not None:
             logits.add_(sampling_info.logit_bias)
         if sampling_info.vocab_mask is not None:
             logits = logits.masked_fill(~sampling_info.vocab_mask, float("-inf"))
-        logits = self._apply_penalties(logits, sampling_info)
+        logits = sampling_info.penalizer_orchestrator.apply(logits)
-        return torch.softmax(logits, dim=-1)
-    def forward_cuda(
-        self,
-        logits: Union[torch.Tensor, LogitsProcessorOutput],
-        sampling_info: SamplingBatchInfo,
-    ):
-        if isinstance(logits, LogitsProcessorOutput):
-            logits = logits.next_token_logits
-        probs = self._get_probs(logits, sampling_info)
+        probs = torch.softmax(logits, dim=-1)
         if not global_server_args_dict["disable_flashinfer_sampling"]:
             max_top_k_round, batch_size = 32, probs.shape[0]
             uniform_samples = torch.rand(
                 (max_top_k_round, batch_size), device=probs.device
             )
-            if sampling_info.need_min_p_sampling:
+            if sampling_info.min_ps.any():
                 probs = top_k_renorm_prob(probs, sampling_info.top_ks)
                 probs = top_p_renorm_prob(probs, sampling_info.top_ps)
                 batch_next_token_ids, success = min_p_sampling_from_probs(
@@ -100,23 +55,18 @@ class Sampler(CustomOp):
                 probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
             )
-        return SampleOutput(success, probs, batch_next_token_ids)
-    def forward_native(
-        self,
-        logits: Union[torch.Tensor, LogitsProcessorOutput],
-        sampling_info: SamplingBatchInfo,
-    ):
-        if isinstance(logits, LogitsProcessorOutput):
-            logits = logits.next_token_logits
-        probs = self._get_probs(logits, sampling_info, is_torch_compile=True)
+        if not torch.all(success):
+            logging.warning("Sampling failed, fallback to top_k=1 strategy")
+            probs = probs.masked_fill(torch.isnan(probs), 0.0)
+            argmax_ids = torch.argmax(probs, dim=-1)
+            batch_next_token_ids = torch.where(
+                success, batch_next_token_ids, argmax_ids
+            )
-        batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch(
-            probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
-        )
+        return batch_next_token_ids
-        return SampleOutput(success, probs, batch_next_token_ids)
+    def forward_native():
+        raise NotImplementedError("Native forward is not implemented yet.")
 def top_k_top_p_min_p_sampling_from_probs_torch(
@@ -137,10 +87,7 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
     probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0
     probs_sort.div_(probs_sort.max(dim=-1, keepdim=True)[0])
     try:
-        # FIXME: torch.multiomial does not support num_samples = 1
-        sampled_index = torch.multinomial(probs_sort, num_samples=2, replacement=True)[
-            :, :1
-        ]
+        sampled_index = torch.multinomial(probs_sort, num_samples=1)
     except RuntimeError as e:
         logger.warning(f"Sampling error: {e}")
         batch_next_token_ids = torch.zeros(

sglang/srt/managers/io_struct.py CHANGED Viewed

@@ -55,6 +55,7 @@ class GenerateReqInput:
             self.text is not None and self.input_ids is not None
         ):
             raise ValueError("Either text or input_ids should be provided.")
         if (
             isinstance(self.sampling_params, dict)
             and self.sampling_params.get("n", 1) != 1
@@ -161,10 +162,10 @@ class TokenizedGenerateReqInput:
     input_ids: List[int]
     # The pixel values for input images
     pixel_values: List[float]
-    # The hash of input images
-    image_hash: int
-    # The image size
-    image_size: List[int]
+    # The hash values of input images
+    image_hashes: List[int]
+    # The image sizes
+    image_sizes: List[List[int]]
     # The sampling parameters
     sampling_params: SamplingParams
     # Whether to return the logprobs

sglang 0.2.14__py3-none-any.whl → 0.2.14.post2__py3-none-any.whl

sglang 0.2.14__py3-none-any.whl → 0.2.14.post2__py3-none-any.whl