sglang 0.2.14.post1__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sglang/api.py +2 -0
  2. sglang/bench_latency.py +39 -28
  3. sglang/lang/interpreter.py +3 -0
  4. sglang/lang/ir.py +5 -0
  5. sglang/launch_server_llavavid.py +26 -0
  6. sglang/srt/configs/__init__.py +5 -0
  7. sglang/srt/configs/exaone.py +195 -0
  8. sglang/srt/constrained/fsm_cache.py +1 -1
  9. sglang/srt/conversation.py +24 -2
  10. sglang/srt/hf_transformers_utils.py +11 -160
  11. sglang/srt/layers/activation.py +10 -4
  12. sglang/srt/layers/extend_attention.py +13 -8
  13. sglang/srt/layers/layernorm.py +47 -1
  14. sglang/srt/layers/logits_processor.py +4 -4
  15. sglang/srt/layers/sampler.py +69 -16
  16. sglang/srt/managers/controller_multi.py +5 -5
  17. sglang/srt/managers/controller_single.py +5 -5
  18. sglang/srt/managers/io_struct.py +11 -5
  19. sglang/srt/managers/schedule_batch.py +25 -13
  20. sglang/srt/managers/tokenizer_manager.py +76 -63
  21. sglang/srt/managers/tp_worker.py +47 -36
  22. sglang/srt/model_config.py +3 -3
  23. sglang/srt/model_executor/cuda_graph_runner.py +24 -9
  24. sglang/srt/model_executor/forward_batch_info.py +78 -43
  25. sglang/srt/model_executor/model_runner.py +29 -18
  26. sglang/srt/models/chatglm.py +5 -13
  27. sglang/srt/models/commandr.py +5 -1
  28. sglang/srt/models/dbrx.py +5 -1
  29. sglang/srt/models/deepseek.py +5 -1
  30. sglang/srt/models/deepseek_v2.py +57 -25
  31. sglang/srt/models/exaone.py +399 -0
  32. sglang/srt/models/gemma.py +7 -3
  33. sglang/srt/models/gemma2.py +6 -52
  34. sglang/srt/models/gpt_bigcode.py +5 -1
  35. sglang/srt/models/grok.py +14 -4
  36. sglang/srt/models/internlm2.py +5 -1
  37. sglang/srt/models/llama2.py +10 -7
  38. sglang/srt/models/llama_classification.py +2 -6
  39. sglang/srt/models/llama_embedding.py +3 -4
  40. sglang/srt/models/llava.py +69 -91
  41. sglang/srt/models/llavavid.py +40 -86
  42. sglang/srt/models/minicpm.py +5 -1
  43. sglang/srt/models/mixtral.py +6 -2
  44. sglang/srt/models/mixtral_quant.py +5 -1
  45. sglang/srt/models/qwen.py +5 -2
  46. sglang/srt/models/qwen2.py +9 -6
  47. sglang/srt/models/qwen2_moe.py +12 -33
  48. sglang/srt/models/stablelm.py +5 -1
  49. sglang/srt/models/yivl.py +2 -7
  50. sglang/srt/openai_api/adapter.py +16 -1
  51. sglang/srt/openai_api/protocol.py +5 -5
  52. sglang/srt/sampling/sampling_batch_info.py +79 -6
  53. sglang/srt/server.py +9 -9
  54. sglang/srt/utils.py +18 -36
  55. sglang/test/runners.py +2 -2
  56. sglang/test/test_layernorm.py +53 -1
  57. sglang/version.py +1 -1
  58. {sglang-0.2.14.post1.dist-info → sglang-0.2.15.dist-info}/METADATA +8 -8
  59. sglang-0.2.15.dist-info/RECORD +118 -0
  60. sglang-0.2.14.post1.dist-info/RECORD +0 -114
  61. {sglang-0.2.14.post1.dist-info → sglang-0.2.15.dist-info}/LICENSE +0 -0
  62. {sglang-0.2.14.post1.dist-info → sglang-0.2.15.dist-info}/WHEEL +0 -0
  63. {sglang-0.2.14.post1.dist-info → sglang-0.2.15.dist-info}/top_level.txt +0 -0

sglang/srt/hf_transformers_utils.py
@@ -15,6 +15,7 @@ limitations under the License.
 
 """Utilities for Huggingface Transformers."""
 
+import contextlib
 import functools
 import json
 import os
@@ -34,15 +35,20 @@ from transformers import (
 try:
     from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig
 
+    from sglang.srt.configs import ExaoneConfig
+
     _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
         ChatGLMConfig.model_type: ChatGLMConfig,
         DbrxConfig.model_type: DbrxConfig,
+        ExaoneConfig.model_type: ExaoneConfig,
     }
 except ImportError:
     # We want this file to run without vllm dependency
     _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {}
 
-from sglang.srt.utils import is_multimodal_model
+for name, cls in _CONFIG_REGISTRY.items():
+    with contextlib.suppress(ValueError):
+        AutoConfig.register(name, cls)
 
 
 def download_from_hf(model_path: str):
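
For context, AutoConfig.register raises ValueError when a model type is already registered, which is why the new loop wraps each call in contextlib.suppress. A minimal, self-contained sketch of the same pattern, using a stand-in config class instead of sglang's ExaoneConfig (which only imports when vllm is available):

```python
import contextlib

from transformers import AutoConfig, PretrainedConfig


class ExaoneLikeConfig(PretrainedConfig):
    # Stand-in for sglang.srt.configs.ExaoneConfig; model_type is the key
    # AutoConfig uses to dispatch a config.json to this class.
    model_type = "exaone-like-demo"


_CONFIG_REGISTRY = {ExaoneLikeConfig.model_type: ExaoneLikeConfig}

# Mirror of the registration loop added above: suppressing ValueError keeps a
# second registration (e.g. a re-import) from crashing the process.
for name, cls in _CONFIG_REGISTRY.items():
    with contextlib.suppress(ValueError):
        AutoConfig.register(name, cls)

print(type(AutoConfig.for_model("exaone-like-demo")).__name__)  # ExaoneLikeConfig
```
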
@@ -52,17 +58,11 @@ def download_from_hf(model_path: str):
     return snapshot_download(model_path, allow_patterns=["*.json", "*.bin", "*.model"])
 
 
-def get_config_json(model_path: str):
-    with open(os.path.join(model_path, "config.json")) as f:
-        config = json.load(f)
-    return config
-
-
 def get_config(
     model: str,
     trust_remote_code: bool,
     revision: Optional[str] = None,
-    model_overide_args: Optional[dict] = None,
+    model_override_args: Optional[dict] = None,
 ):
     config = AutoConfig.from_pretrained(
         model, trust_remote_code=trust_remote_code, revision=revision
@@ -70,8 +70,8 @@ def get_config(
     if config.model_type in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[config.model_type]
         config = config_class.from_pretrained(model, revision=revision)
-    if model_overide_args:
-        config.update(model_overide_args)
+    if model_override_args:
+        config.update(model_override_args)
     return config
 
 
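
The two get_config hunks above fix the long-standing model_overide_args typo; the same rename runs through the controller, manager, and server modules later in this diff. A hedged sketch of a call using the corrected keyword (the checkpoint name is an arbitrary public model chosen for illustration, not something this diff touches):

```python
from sglang.srt.hf_transformers_utils import get_config

# model_override_args is applied on top of the downloaded config via
# PretrainedConfig.update(), so any config field can be patched this way.
config = get_config(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",      # illustrative model id only
    trust_remote_code=False,
    model_override_args={"max_position_embeddings": 4096},
)
print(config.max_position_embeddings)  # 4096
```
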
@@ -89,7 +89,7 @@ CONTEXT_LENGTH_KEYS = [
 
 
 def get_context_length(config):
-    """Get the context length of a model from a huggingface model config."""
+    """Get the context length of a model from a huggingface model configs."""
     rope_scaling = getattr(config, "rope_scaling", None)
     if rope_scaling:
         rope_scaling_factor = config.rope_scaling["factor"]
@@ -119,24 +119,7 @@ def get_tokenizer(
     tokenizer_revision: Optional[str] = None,
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
-    if tokenizer_name.endswith(".json"):
-        return TiktokenTokenizer(tokenizer_name)
-
-    if tokenizer_name.endswith(".model"):
-        return SentencePieceTokenizer(tokenizer_name)
-
     """Gets a tokenizer for the given model name via Huggingface."""
-    if is_multimodal_model(tokenizer_name):
-        processor = get_processor(
-            tokenizer_name,
-            *args,
-            trust_remote_code=trust_remote_code,
-            tokenizer_revision=tokenizer_revision,
-            **kwargs,
-        )
-        tokenizer = processor.tokenizer
-        return tokenizer
-
     if tokenizer_mode == "slow":
         if kwargs.get("use_fast", False):
             raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
@@ -199,135 +182,3 @@ def get_processor(
         **kwargs,
     )
     return processor
-
-
-class TiktokenTokenizer:
-    def __init__(self, tokenizer_path):
-        import tiktoken
-        from jinja2 import Template
-
-        PAT_STR_B = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
-
-        # Read JSON
-        name = "tmp-json"
-        with open(tokenizer_path, "rb") as fin:
-            tok_dict = json.load(fin)
-
-        mergeable_ranks = {
-            bytes(item["bytes"]): item["token"] for item in tok_dict["regular_tokens"]
-        }
-        special_tokens = {
-            bytes(item["bytes"]).decode(): item["token"]
-            for item in tok_dict["special_tokens"]
-        }
-        assert tok_dict["word_split"] == "V1"
-
-        default_allowed_special = None
-
-        kwargs = {
-            "name": name,
-            "pat_str": tok_dict.get("pat_str", PAT_STR_B),
-            "mergeable_ranks": mergeable_ranks,
-            "special_tokens": special_tokens,
-        }
-        if "default_allowed_special" in tok_dict:
-            default_allowed_special = set(
-                [
-                    bytes(bytes_list).decode()
-                    for bytes_list in tok_dict["default_allowed_special"]
-                ]
-            )
-        if "vocab_size" in tok_dict:
-            kwargs["explicit_n_vocab"] = tok_dict["vocab_size"]
-
-        PAD = "<|pad|>"
-        EOS = "<|eos|>"
-        SEP = "<|separator|>"
-
-        DEFAULT_CONTROL_TOKENS = {"pad": PAD, "sep": EOS, "eos": SEP}
-
-        tokenizer = tiktoken.Encoding(**kwargs)
-        tokenizer._default_allowed_special = default_allowed_special or set()
-        tokenizer._control_tokens = DEFAULT_CONTROL_TOKENS
-
-        def encode_patched(
-            self,
-            text: str,
-            *,
-            allowed_special: Union[
-                Literal["all"], AbstractSet[str]
-            ] = set(),  # noqa: B006
-            disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-        ) -> List[int]:
-            if isinstance(allowed_special, set):
-                allowed_special |= self._default_allowed_special
-            return tiktoken.Encoding.encode(
-                self,
-                text,
-                allowed_special=allowed_special,
-                disallowed_special=(),
-            )
-
-        tokenizer.encode = functools.partial(encode_patched, tokenizer)
-
-        # Convert to HF interface
-        self.tokenizer = tokenizer
-        self.eos_token_id = tokenizer._special_tokens[EOS]
-        self.vocab_size = tokenizer.n_vocab
-        self.chat_template = Template(
-            "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
-        )
-
-    def encode(self, x, add_special_tokens=False):
-        return self.tokenizer.encode(x)
-
-    def decode(self, x):
-        return self.tokenizer.decode(x)
-
-    def batch_decode(
-        self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
-    ):
-        if isinstance(batch[0], int):
-            batch = [[x] for x in batch]
-        return self.tokenizer.decode_batch(batch)
-
-    def apply_chat_template(self, messages, tokenize, add_generation_prompt):
-        ret = self.chat_template.render(
-            messages=messages, add_generation_prompt=add_generation_prompt
-        )
-        return self.encode(ret) if tokenize else ret
-
-
-class SentencePieceTokenizer:
-    def __init__(self, tokenizer_path):
-        import sentencepiece as spm
-        from jinja2 import Template
-
-        tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path)
-
-        # Convert to HF interface
-        self.tokenizer = tokenizer
-        self.eos_token_id = tokenizer.eos_id()
-        self.vocab_size = tokenizer.vocab_size()
-        self.chat_template = Template(
-            "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
-        )
-
-    def encode(self, x, add_special_tokens=False):
-        return self.tokenizer.encode(x)
-
-    def decode(self, x):
-        return self.tokenizer.decode(x)
-
-    def batch_decode(
-        self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
-    ):
-        if isinstance(batch[0], int):
-            batch = [[x] for x in batch]
-        return self.tokenizer.decode(batch)
-
-    def apply_chat_template(self, messages, tokenize, add_generation_prompt):
-        ret = self.chat_template.render(
-            messages=messages, add_generation_prompt=add_generation_prompt
-        )
-        return self.encode(ret) if tokenize else ret

sglang/srt/layers/activation.py
@@ -18,7 +18,7 @@ from typing import Optional
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from flashinfer.activation import gelu_tanh_and_mul, silu_and_mul
+from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
 from vllm.distributed import (
     divide,
     get_tensor_model_parallel_rank,
@@ -43,18 +43,24 @@ class SiluAndMul(CustomOp):
 
 
 class GeluAndMul(CustomOp):
-    def __init__(self, **kwargs):
+    def __init__(self, approximate="tanh"):
         super().__init__()
+        self.approximate = approximate
 
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
-        return F.gelu(x[..., :d], approximate="tanh") * x[..., d:]
+        return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
 
     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
         output_shape = x.shape[:-1] + (d,)
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        gelu_tanh_and_mul(x, out)
+        if self.approximate == "tanh":
+            gelu_tanh_and_mul(x, out)
+        elif self.approximate == "none":
+            gelu_and_mul(x, out)
+        else:
+            raise RuntimeError("GeluAndMul only support tanh or none")
         return out
 
 
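
Since forward_native above is plain PyTorch, the effect of the new approximate argument can be checked without flashinfer. A small stand-alone sketch of the gated-GELU math for both modes (not sglang code, just the same formula):

```python
import torch
import torch.nn.functional as F


def gelu_and_mul_reference(x: torch.Tensor, approximate: str = "tanh") -> torch.Tensor:
    # Mirrors GeluAndMul.forward_native: the last dim holds [gate, up] halves;
    # GELU is applied to the gate half and multiplied into the up half.
    d = x.shape[-1] // 2
    return F.gelu(x[..., :d], approximate=approximate) * x[..., d:]


x = torch.randn(2, 8)
print(gelu_and_mul_reference(x, "tanh").shape)   # tanh approximation
print(gelu_and_mul_reference(x, "none").shape)   # exact erf-based GELU
```

On GPU, forward_cuda now dispatches to flashinfer's gelu_tanh_and_mul or gelu_and_mul accordingly.
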

sglang/srt/layers/extend_attention.py
@@ -127,8 +127,7 @@ def _fwd_kernel(
         )
         k = tl.load(K_Buffer + offs_buf_k, mask=mask_n[None, :], other=0.0)
 
-        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk += tl.dot(q, k)
+        qk = tl.dot(q.to(k.dtype), k)
         if BLOCK_DPE > 0:
             offs_kpe = (
                 offs_kv_loc[None, :] * stride_buf_kbs
@@ -140,7 +139,7 @@
                 mask=mask_n[None, :],
                 other=0.0,
             )
-            qk += tl.dot(qpe, kpe)
+            qk += tl.dot(qpe.to(kpe.dtype), kpe)
         qk *= sm_scale
 
         if logit_cap > 0:
@@ -179,9 +178,7 @@
         )
         k = tl.load(K_Extend + offs_k, mask=mask_n[None, :], other=0.0)
 
-        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk += tl.dot(q, k)
-
+        qk = tl.dot(q, k, out_dtype=tl.float32)
         if BLOCK_DPE > 0:
             offs_kpe = (
                 (cur_seq_extend_start_contiguous + start_n + offs_n[None, :])
@@ -276,9 +273,17 @@
     BLOCK_DV = Lv
 
     if CUDA_CAPABILITY[0] >= 9:
-        BLOCK_M, BLOCK_N = (128, 64)
+        if Lq <= 256:
+            BLOCK_M, BLOCK_N = (128, 64)
+        else:
+            BLOCK_M, BLOCK_N = (32, 64)
     elif CUDA_CAPABILITY[0] >= 8:
-        BLOCK_M, BLOCK_N = (128, 128) if Lq <= 128 else (64, 64)
+        if Lq <= 128:
+            BLOCK_M, BLOCK_N = (128, 128)
+        elif Lq <= 256:
+            BLOCK_M, BLOCK_N = (64, 64)
+        else:
+            BLOCK_M, BLOCK_N = (32, 64)
     else:
         BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
 
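
Read as a table, the new tile sizes depend on the query head dimension Lq as well as the compute capability. The function below merely restates the selection logic from the hunk above for readability (it is not part of the package), mainly to make the Lq > 256 path visible:

```python
def pick_blocks(cuda_major: int, Lq: int):
    """(BLOCK_M, BLOCK_N) as chosen by extend_attention_fwd in 0.2.15."""
    if cuda_major >= 9:                      # compute capability 9.x
        return (128, 64) if Lq <= 256 else (32, 64)
    if cuda_major >= 8:                      # compute capability 8.x
        if Lq <= 128:
            return (128, 128)
        return (64, 64) if Lq <= 256 else (32, 64)
    return (64, 64) if Lq <= 128 else (32, 32)


print(pick_blocks(9, 576))  # head dims above 256 now fall back to (32, 64)
```
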

sglang/srt/layers/layernorm.py
@@ -19,7 +19,12 @@ from typing import Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
-from flashinfer.norm import fused_add_rmsnorm, rmsnorm
+from flashinfer.norm import (
+    fused_add_rmsnorm,
+    gemma_fused_add_rmsnorm,
+    gemma_rmsnorm,
+    rmsnorm,
+)
 from vllm.model_executor.custom_op import CustomOp
 
 
@@ -63,3 +68,44 @@ class RMSNorm(CustomOp):
             return x
         else:
             return x, residual
+
+
+class GemmaRMSNorm(CustomOp):
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        orig_dtype = x.dtype
+        if residual is not None:
+            x = x + residual
+            residual = x
+
+        x = x.float()
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = x * (1.0 + self.weight.float())
+        x = x.to(orig_dtype)
+        return x if residual is None else (x, residual)
+
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            gemma_fused_add_rmsnorm(
+                x, residual, self.weight.data, self.variance_epsilon
+            )
+            return x, residual
+        out = gemma_rmsnorm(x, self.weight.data, self.variance_epsilon)
+        return out
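
The distinguishing details of the new GemmaRMSNorm are the float32 accumulation and the (1 + weight) scaling, with weight initialised to zeros; forward_cuda defers to flashinfer's gemma_rmsnorm / gemma_fused_add_rmsnorm kernels. A pure-torch sketch of the same math, useful as a quick sanity check (this is not the sglang class itself):

```python
import torch


def gemma_rms_norm_reference(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6):
    # Same steps as GemmaRMSNorm.forward_native: compute in float32, scale by
    # (1 + weight), then cast back to the input dtype.
    orig_dtype = x.dtype
    xf = x.float()
    variance = xf.pow(2).mean(dim=-1, keepdim=True)
    xf = xf * torch.rsqrt(variance + eps)
    xf = xf * (1.0 + weight.float())
    return xf.to(orig_dtype)


x = torch.randn(4, 16, dtype=torch.float16)
w = torch.zeros(16)  # a freshly initialised layer is a plain RMS normalisation
print(gemma_rms_norm_reference(x, w).dtype, gemma_rms_norm_reference(x, w).shape)
```
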

sglang/srt/layers/logits_processor.py
@@ -29,7 +29,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetad
 
 
 @dataclasses.dataclass
-class LogitProcessorOutput:
+class LogitsProcessorOutput:
     # The logits of the next tokens. shape: [#seq, vocab_size]
     next_token_logits: torch.Tensor
     # The logprobs of the next tokens. shape: [#seq, vocab_size]
@@ -185,7 +185,7 @@ class LogitsProcessor(nn.Module):
 
         # Return only last_logits if logprob is not requested
         if not logits_metadata.return_logprob:
-            return LogitProcessorOutput(
+            return LogitsProcessorOutput(
                 next_token_logits=last_logits,
                 next_token_logprobs=None,
                 normalized_prompt_logprobs=None,
@@ -209,7 +209,7 @@ class LogitsProcessor(nn.Module):
             else:
                 output_top_logprobs = None
 
-            return LogitProcessorOutput(
+            return LogitsProcessorOutput(
                 next_token_logits=last_logits,
                 next_token_logprobs=last_logprobs,
                 normalized_prompt_logprobs=None,
@@ -278,7 +278,7 @@ class LogitsProcessor(nn.Module):
             # Remove the last token logprob for the prefill tokens.
             input_token_logprobs = input_token_logprobs[:-1]
 
-            return LogitProcessorOutput(
+            return LogitsProcessorOutput(
                 next_token_logits=last_logits,
                 next_token_logprobs=last_logprobs,
                 normalized_prompt_logprobs=normalized_prompt_logprobs,

sglang/srt/layers/sampler.py
@@ -1,4 +1,6 @@
+import dataclasses
 import logging
+from typing import Union
 
 import torch
 from flashinfer.sampling import (
@@ -9,6 +11,8 @@ from flashinfer.sampling import (
 )
 from vllm.model_executor.custom_op import CustomOp
 
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+
 # TODO: move this dict to another place
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
@@ -16,30 +20,71 @@ from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 logger = logging.getLogger(__name__)
 
 
+@dataclasses.dataclass
+class SampleOutput:
+    success: torch.Tensor
+    probs: torch.Tensor
+    batch_next_token_ids: torch.Tensor
+
+
 class Sampler(CustomOp):
     def __init__(self):
         super().__init__()
 
-    def forward_cuda(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo):
+    def _apply_penalties(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo):
+        # min-token, presence, frequency
+        if sampling_info.linear_penalties is not None:
+            logits += sampling_info.linear_penalties
+
+        # repetition
+        if sampling_info.scaling_penalties is not None:
+            logits = torch.where(
+                logits > 0,
+                logits / sampling_info.scaling_penalties,
+                logits * sampling_info.scaling_penalties,
+            )
+
+        return logits
+
+    def _get_probs(
+        self,
+        logits: torch.Tensor,
+        sampling_info: SamplingBatchInfo,
+        is_torch_compile: bool = False,
+    ):
         # Post process logits
         logits = logits.contiguous()
         logits.div_(sampling_info.temperatures)
+        if is_torch_compile:
+            # FIXME: Temporary workaround for unknown bugs in torch.compile
+            logits.add_(0)
+
         if sampling_info.logit_bias is not None:
             logits.add_(sampling_info.logit_bias)
 
         if sampling_info.vocab_mask is not None:
-            logits = logits.masked_fill(~sampling_info.vocab_mask, float("-inf"))
+            logits = logits.masked_fill(sampling_info.vocab_mask, float("-inf"))
 
-        logits = sampling_info.penalizer_orchestrator.apply(logits)
+        logits = self._apply_penalties(logits, sampling_info)
 
-        probs = torch.softmax(logits, dim=-1)
+        return torch.softmax(logits, dim=-1)
+
+    def forward_cuda(
+        self,
+        logits: Union[torch.Tensor, LogitsProcessorOutput],
+        sampling_info: SamplingBatchInfo,
+    ):
+        if isinstance(logits, LogitsProcessorOutput):
+            logits = logits.next_token_logits
+
+        probs = self._get_probs(logits, sampling_info)
 
         if not global_server_args_dict["disable_flashinfer_sampling"]:
             max_top_k_round, batch_size = 32, probs.shape[0]
             uniform_samples = torch.rand(
                 (max_top_k_round, batch_size), device=probs.device
             )
-            if sampling_info.min_ps.any():
+            if sampling_info.need_min_p_sampling:
                 probs = top_k_renorm_prob(probs, sampling_info.top_ks)
                 probs = top_p_renorm_prob(probs, sampling_info.top_ps)
                 batch_next_token_ids, success = min_p_sampling_from_probs(
@@ -55,18 +100,23 @@ class Sampler(CustomOp):
                 probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
             )
 
-        if not torch.all(success):
-            logging.warning("Sampling failed, fallback to top_k=1 strategy")
-            probs = probs.masked_fill(torch.isnan(probs), 0.0)
-            argmax_ids = torch.argmax(probs, dim=-1)
-            batch_next_token_ids = torch.where(
-                success, batch_next_token_ids, argmax_ids
-            )
+        return SampleOutput(success, probs, batch_next_token_ids)
 
-        return batch_next_token_ids
+    def forward_native(
+        self,
+        logits: Union[torch.Tensor, LogitsProcessorOutput],
+        sampling_info: SamplingBatchInfo,
+    ):
+        if isinstance(logits, LogitsProcessorOutput):
+            logits = logits.next_token_logits
+
+        probs = self._get_probs(logits, sampling_info, is_torch_compile=True)
+
+        batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch(
+            probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
+        )
 
-    def forward_native():
-        raise NotImplementedError("Native forward is not implemented yet.")
+        return SampleOutput(success, probs, batch_next_token_ids)
 
 
 def top_k_top_p_min_p_sampling_from_probs_torch(
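
With this hunk the sampler no longer repairs failed samples itself; forward_cuda and forward_native both return a SampleOutput and leave the fallback to the caller (presumably the tp_worker / model_runner changes listed above, which are not shown here). A hedged sketch of what such caller-side handling could look like, reusing the logic that was removed:

```python
import torch


def resolve_sample_output(success: torch.Tensor, probs: torch.Tensor,
                          batch_next_token_ids: torch.Tensor) -> torch.Tensor:
    # Same idea as the fallback deleted from Sampler.forward_cuda: wherever
    # sampling reported failure, drop NaNs and fall back to greedy argmax.
    if not torch.all(success):
        probs = probs.masked_fill(torch.isnan(probs), 0.0)
        argmax_ids = torch.argmax(probs, dim=-1)
        batch_next_token_ids = torch.where(success, batch_next_token_ids, argmax_ids)
    return batch_next_token_ids


probs = torch.softmax(torch.randn(2, 10), dim=-1)
ids = torch.multinomial(probs, num_samples=1).squeeze(-1)
success = torch.tensor([True, False])
print(resolve_sample_output(success, probs, ids))
```
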
@@ -87,7 +137,10 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
     probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0
     probs_sort.div_(probs_sort.max(dim=-1, keepdim=True)[0])
     try:
-        sampled_index = torch.multinomial(probs_sort, num_samples=1)
+        # FIXME: torch.multiomial does not support num_samples = 1
+        sampled_index = torch.multinomial(probs_sort, num_samples=2, replacement=True)[
+            :, :1
+        ]
     except RuntimeError as e:
         logger.warning(f"Sampling error: {e}")
         batch_next_token_ids = torch.zeros(

sglang/srt/managers/controller_multi.py
@@ -71,12 +71,12 @@ class ControllerMulti:
         self,
         server_args: ServerArgs,
         port_args: PortArgs,
-        model_overide_args,
+        model_override_args,
     ):
         # Parse args
         self.server_args = server_args
         self.port_args = port_args
-        self.model_overide_args = model_overide_args
+        self.model_override_args = model_override_args
         self.load_balance_method = LoadBalanceMethod.from_str(
             server_args.load_balance_method
         )
@@ -114,7 +114,7 @@ class ControllerMulti:
                 self.server_args,
                 self.port_args,
                 pipe_controller_writer,
-                self.model_overide_args,
+                self.model_override_args,
                 True,
                 gpu_ids,
                 dp_worker_id,
@@ -189,14 +189,14 @@ def start_controller_process(
     server_args: ServerArgs,
     port_args: PortArgs,
     pipe_writer,
-    model_overide_args: dict,
+    model_override_args: dict,
 ):
     """Start a controller process."""
 
     configure_logger(server_args)
 
     try:
-        controller = ControllerMulti(server_args, port_args, model_overide_args)
+        controller = ControllerMulti(server_args, port_args, model_override_args)
     except Exception:
         pipe_writer.send(get_exception_traceback())
         raise

sglang/srt/managers/controller_single.py
@@ -40,7 +40,7 @@ class ControllerSingle:
         self,
         server_args: ServerArgs,
         port_args: PortArgs,
-        model_overide_args: dict,
+        model_override_args: dict,
         gpu_ids: List[int],
         is_data_parallel_worker: bool,
         dp_worker_id: int,
@@ -76,7 +76,7 @@ class ControllerSingle:
                 tp_rank_range,
                 server_args,
                 port_args.nccl_ports[dp_worker_id],
-                model_overide_args,
+                model_override_args,
             )
 
         # Launch tp rank 0
@@ -85,7 +85,7 @@ class ControllerSingle:
             0,
             server_args,
             port_args.nccl_ports[dp_worker_id],
-            model_overide_args,
+            model_override_args,
         )
         self.tp_cpu_group = self.tp_server.model_runner.tp_group.cpu_group
 
@@ -126,7 +126,7 @@ def start_controller_process(
     server_args: ServerArgs,
     port_args: PortArgs,
     pipe_writer: multiprocessing.connection.Connection,
-    model_overide_args: dict,
+    model_override_args: dict,
     is_data_parallel_worker: bool = False,
     gpu_ids: List[int] = None,
     dp_worker_id: int = None,
@@ -149,7 +149,7 @@
         controller = ControllerSingle(
             server_args,
             port_args,
-            model_overide_args,
+            model_override_args,
             gpu_ids,
             is_data_parallel_worker,
             dp_worker_id,

sglang/srt/managers/io_struct.py
@@ -18,8 +18,9 @@ The definition of objects transfered between different
 processes (TokenizerManager, DetokenizerManager, Controller).
 """
 
+import copy
 import uuid
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
 from sglang.srt.managers.schedule_batch import BaseFinishReason
@@ -55,6 +56,7 @@ class GenerateReqInput:
             self.text is not None and self.input_ids is not None
         ):
             raise ValueError("Either text or input_ids should be provided.")
+
         if (
             isinstance(self.sampling_params, dict)
             and self.sampling_params.get("n", 1) != 1
@@ -161,10 +163,10 @@ class TokenizedGenerateReqInput:
     input_ids: List[int]
     # The pixel values for input images
     pixel_values: List[float]
-    # The hash of input images
-    image_hash: int
-    # The image size
-    image_size: List[int]
+    # The hash values of input images
+    image_hashes: List[int]
+    # The image sizes
+    image_sizes: List[List[int]]
     # The sampling parameters
     sampling_params: SamplingParams
     # Whether to return the logprobs
@@ -248,6 +250,10 @@ class BatchTokenIDOut:
     meta_info: List[Dict]
     finished_reason: List[BaseFinishReason]
 
+    def __post_init__(self):
+        # deepcopy meta_info to avoid modification in place
+        self.meta_info = copy.deepcopy(self.meta_info)
+
 
 @dataclass
 class BatchStrOut:
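
The new __post_init__ hook deep-copies meta_info so that mutating one output batch cannot leak into other objects that still reference the same dicts. A self-contained illustration of the aliasing problem this guards against (a generic dataclass, not the sglang type):

```python
import copy
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class WithCopy:
    meta_info: List[Dict]

    def __post_init__(self):
        # Same pattern as BatchTokenIDOut: detach from the caller's objects.
        self.meta_info = copy.deepcopy(self.meta_info)


shared = [{"prompt_tokens": 3}]
out = WithCopy(shared)
shared[0]["prompt_tokens"] = 999          # the caller keeps mutating its dict

print(out.meta_info[0]["prompt_tokens"])  # 3 -- the dataclass kept its own copy
```
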