PyPI - sglang - Versions diffs - 0.3.5.post2__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl - Mend

sglang 0.3.5.post2__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (118)

sglang/__init__.py +2 -2
sglang/api.py +2 -2
sglang/bench_latency.py +1 -553
sglang/bench_offline_throughput.py +48 -20
sglang/bench_one_batch.py +472 -0
sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
sglang/bench_serving.py +125 -6
sglang/check_env.py +3 -6
sglang/lang/backend/base_backend.py +1 -1
sglang/lang/backend/runtime_endpoint.py +2 -2
sglang/srt/configs/model_config.py +13 -14
sglang/srt/constrained/__init__.py +13 -14
sglang/srt/constrained/base_grammar_backend.py +13 -15
sglang/srt/constrained/outlines_backend.py +28 -17
sglang/srt/constrained/outlines_jump_forward.py +13 -15
sglang/srt/constrained/xgrammar_backend.py +47 -58
sglang/srt/conversation.py +13 -15
sglang/srt/hf_transformers_utils.py +13 -15
sglang/srt/layers/activation.py +16 -13
sglang/srt/layers/attention/flashinfer_backend.py +106 -54
sglang/srt/layers/attention/triton_backend.py +9 -7
sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
sglang/srt/layers/custom_op_util.py +25 -0
sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +11 -4
sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
sglang/srt/layers/fused_moe_triton/layer.py +633 -0
sglang/srt/layers/layernorm.py +17 -15
sglang/srt/layers/logits_processor.py +23 -25
sglang/srt/layers/quantization/__init__.py +77 -17
sglang/srt/layers/radix_attention.py +13 -15
sglang/srt/layers/rotary_embedding.py +13 -13
sglang/srt/layers/sampler.py +4 -8
sglang/srt/layers/torchao_utils.py +2 -0
sglang/srt/lora/lora.py +13 -14
sglang/srt/lora/lora_config.py +13 -14
sglang/srt/lora/lora_manager.py +22 -24
sglang/srt/managers/data_parallel_controller.py +98 -27
sglang/srt/managers/detokenizer_manager.py +13 -15
sglang/srt/managers/io_struct.py +63 -21
sglang/srt/managers/schedule_batch.py +154 -59
sglang/srt/managers/schedule_policy.py +18 -16
sglang/srt/managers/scheduler.py +278 -109
sglang/srt/managers/session_controller.py +61 -0
sglang/srt/managers/tokenizer_manager.py +63 -18
sglang/srt/managers/tp_worker.py +25 -16
sglang/srt/managers/tp_worker_overlap_thread.py +62 -67
sglang/srt/metrics/collector.py +13 -15
sglang/srt/metrics/func_timer.py +13 -15
sglang/srt/mm_utils.py +13 -14
sglang/srt/model_executor/cuda_graph_runner.py +63 -25
sglang/srt/model_executor/forward_batch_info.py +128 -32
sglang/srt/model_executor/model_runner.py +132 -64
sglang/srt/model_parallel.py +98 -0
sglang/srt/models/chatglm.py +15 -16
sglang/srt/models/commandr.py +15 -16
sglang/srt/models/dbrx.py +15 -16
sglang/srt/models/deepseek.py +15 -15
sglang/srt/models/deepseek_v2.py +162 -59
sglang/srt/models/exaone.py +14 -15
sglang/srt/models/gemma.py +14 -14
sglang/srt/models/gemma2.py +31 -25
sglang/srt/models/gemma2_reward.py +13 -14
sglang/srt/models/gpt_bigcode.py +14 -14
sglang/srt/models/grok.py +15 -15
sglang/srt/models/internlm2.py +13 -15
sglang/srt/models/internlm2_reward.py +13 -14
sglang/srt/models/llama.py +21 -21
sglang/srt/models/llama_classification.py +13 -14
sglang/srt/models/llama_reward.py +13 -14
sglang/srt/models/llava.py +14 -16
sglang/srt/models/llavavid.py +14 -16
sglang/srt/models/minicpm.py +13 -15
sglang/srt/models/minicpm3.py +13 -15
sglang/srt/models/mistral.py +13 -15
sglang/srt/models/mixtral.py +15 -15
sglang/srt/models/mixtral_quant.py +14 -14
sglang/srt/models/olmo.py +22 -20
sglang/srt/models/olmoe.py +23 -20
sglang/srt/models/phi3_small.py +447 -0
sglang/srt/models/qwen.py +14 -14
sglang/srt/models/qwen2.py +22 -19
sglang/srt/models/qwen2_moe.py +17 -18
sglang/srt/models/qwen2_vl.py +13 -6
sglang/srt/models/stablelm.py +18 -16
sglang/srt/models/torch_native_llama.py +107 -93
sglang/srt/models/xverse.py +13 -14
sglang/srt/models/xverse_moe.py +15 -16
sglang/srt/models/yivl.py +13 -15
sglang/srt/openai_api/adapter.py +19 -17
sglang/srt/openai_api/protocol.py +14 -16
sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
sglang/srt/sampling/sampling_batch_info.py +61 -57
sglang/srt/sampling/sampling_params.py +14 -16
sglang/srt/server.py +86 -35
sglang/srt/server_args.py +96 -80
sglang/srt/utils.py +266 -68
sglang/test/few_shot_gsm8k.py +8 -4
sglang/test/runners.py +38 -20
sglang/test/srt/sampling/penaltylib/utils.py +23 -21
sglang/test/test_utils.py +31 -20
sglang/version.py +1 -1
{sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
{sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +66 -57
sglang-0.3.6.post1.dist-info/RECORD +164 -0
{sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +1 -1
sglang/srt/layers/fused_moe/__init__.py +0 -1
sglang-0.3.5.post2.dist-info/RECORD +0 -156
{sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0

sglang/srt/sampling/penaltylib/orchestrator.py CHANGED Viewed

@@ -1,40 +1,34 @@
 import abc
 import dataclasses
-import typing
+from typing import List, Set, Type, Union
 import torch
 @dataclasses.dataclass
 class _ReqLike:
-    origin_input_ids: typing.Union[torch.Tensor, typing.List[int]]
+    origin_input_ids: List[int]
 @dataclasses.dataclass
 class _BatchLike:
-    reqs: typing.List[_ReqLike]
+    reqs: List[_ReqLike]
     def batch_size(self):
         return len(self.reqs)
 class BatchedPenalizerOrchestrator:
-    batch: _BatchLike
-    device: str
-    vocab_size: int
-    penalizers: typing.Dict[typing.Type["_BatchedPenalizer"], "_BatchedPenalizer"]
     def __init__(
         self,
         vocab_size: int,
         batch: _BatchLike,
         device: str,
-        Penalizers: typing.Set[typing.Type["_BatchedPenalizer"]],
+        Penalizers: Set[Type["_BatchedPenalizer"]],
     ):
         self.vocab_size = vocab_size
         self.batch = batch
         self.device = device
         self.penalizers = {Penalizer: Penalizer(self) for Penalizer in Penalizers}
         is_required = False
@@ -43,10 +37,12 @@ class BatchedPenalizerOrchestrator:
             is_required |= pen_is_required
         self.is_required = is_required
+        input_ids = [
+            torch.tensor(req.origin_input_ids, dtype=torch.int64, device=self.device)
+            for req in self.reqs()
+        ]
         if self.is_required:
-            self.cumulate_input_tokens(
-                input_ids=[req.origin_input_ids for req in self.reqs()]
-            )
+            self.cumulate_input_tokens(input_ids=input_ids)
     def reqs(self):
         return self.batch.reqs
@@ -54,34 +50,24 @@ class BatchedPenalizerOrchestrator:
     def batch_size(self):
         return self.batch.batch_size()
-    def cumulate_input_tokens(
-        self,
-        input_ids: typing.Union[
-            typing.List[torch.Tensor], typing.List[typing.List[int]]
-        ],
-    ):
+    def cumulate_input_tokens(self, input_ids: List[torch.Tensor]):
         """
         Feed the input tokens to the penalizers.
         Args:
-            input_ids (typing.Union[typing.List[torch.Tensor], typing.List[typing.List[int]]]): The input tokens.
+            input_ids (List[torch.Tensor]): The input tokens.
         """
         token_ids = _TokenIDs(orchestrator=self, token_ids=input_ids)
         for penalizer in self.penalizers.values():
             penalizer.cumulate_input_tokens(input_ids=token_ids)
-    def cumulate_output_tokens(
-        self,
-        output_ids: typing.Union[
-            typing.List[torch.Tensor], typing.List[typing.List[int]]
-        ],
-    ):
+    def cumulate_output_tokens(self, output_ids: torch.Tensor):
         """
         Feed the output tokens to the penalizers.
         Args:
-            output_ids (typing.Union[typing.List[torch.Tensor], typing.List[typing.List[int]]]): The output tokens.
+            output_ids (torch.Tensor): The output tokens.
         """
         if not self.is_required:
             return
@@ -112,14 +98,14 @@ class BatchedPenalizerOrchestrator:
     def filter(
         self,
-        indices_to_keep: typing.List[int],
+        indices_to_keep: List[int],
         indices_tensor_to_keep: torch.Tensor = None,
     ):
         """
         Filter the penalizers based on the indices to keep in the batch.
         Args:
-            indices_to_keep (typing.List[int]): List of indices to keep in the batch.
+            indices_to_keep (List[int]): List of indices to keep in the batch.
             indices_tensor_to_keep (torch.Tensor = None): Tensor of indices to keep in the batch. If not None, it will be used instead of converting indices_to_keep to a tensor.
         """
         if not self.is_required:
@@ -174,32 +160,18 @@ class _TokenIDs:
     Attributes:
         orchestrator (BatchedPenalizerOrchestrator): The orchestrator that this token IDs belong to.
-        token_ids (typing.Union[torch.Tensor, typing.List[torch.Tensor]]): The token IDs.
+        token_ids (Union[torch.Tensor, List[torch.Tensor]]): The token IDs.
         cached_counts (torch.Tensor): The cached occurrence count tensor.
     """
-    orchestrator: BatchedPenalizerOrchestrator
-    token_ids: typing.Union[torch.Tensor, typing.List[torch.Tensor]]
-    cached_counts: torch.Tensor = None
     def __init__(
         self,
         orchestrator: BatchedPenalizerOrchestrator,
-        token_ids: typing.Union[
-            typing.List[torch.Tensor], typing.List[typing.List[int]]
-        ],
+        token_ids: Union[torch.Tensor, List[torch.Tensor]],
     ):
         self.orchestrator = orchestrator
-        if not isinstance(token_ids[0], torch.Tensor):
-            token_ids = [
-                torch.tensor(
-                    data=ids, dtype=torch.int64, device=self.orchestrator.device
-                )
-                for ids in token_ids
-            ]
         self.token_ids = token_ids
+        self.cached_counts = None
     def occurrence_count(self) -> torch.Tensor:
         """
@@ -213,30 +185,34 @@ class _TokenIDs:
         token_ids = self.token_ids
-        if isinstance(token_ids, torch.Tensor):
-            token_ids = token_ids.unsqueeze(1)
-            # needs to be long to be used as index in scatter_add
-            if token_ids.dtype != torch.int64:
-                token_ids = token_ids.to(torch.int64)
-        padded_token_ids = torch.nn.utils.rnn.pad_sequence(
-            sequences=token_ids,
-            batch_first=True,
-            padding_value=self.orchestrator.vocab_size,
-        )
-        self.cached_counts = torch.zeros(
-            size=(self.orchestrator.batch_size(), self.orchestrator.vocab_size + 1),
-            dtype=torch.int64,
-            device=self.orchestrator.device,
-        ).scatter_add_(
-            dim=1,
-            index=padded_token_ids,
-            src=torch.ones_like(padded_token_ids),
-        )[
-            :, : self.orchestrator.vocab_size
-        ]
+        if isinstance(token_ids, list):
+            # TODO: optimize this part
+            padded_token_ids = torch.nn.utils.rnn.pad_sequence(
+                sequences=token_ids,
+                batch_first=True,
+                padding_value=self.orchestrator.vocab_size,
+            )
+            self.cached_counts = torch.zeros(
+                size=(self.orchestrator.batch_size(), self.orchestrator.vocab_size + 1),
+                dtype=torch.int64,
+                device=self.orchestrator.device,
+            ).scatter_add_(
+                dim=1,
+                index=padded_token_ids,
+                src=torch.ones_like(padded_token_ids),
+            )[
+                :, : self.orchestrator.vocab_size
+            ]
+        else:
+            # TODO: optimize this part. We do not need to create this big tensor every time.
+            # We can directly apply the results on the logits.
+            self.cached_counts = torch.zeros(
+                size=(self.orchestrator.batch_size(), self.orchestrator.vocab_size),
+                device=self.orchestrator.device,
+            )
+            self.cached_counts[
+                torch.arange(len(token_ids), device=self.orchestrator.device), token_ids
+            ] = 1
         return self.cached_counts
@@ -246,11 +222,9 @@ class _BatchedPenalizer(abc.ABC):
     An abstract class for a batched penalizer.
     """
-    orchestrator: BatchedPenalizerOrchestrator
-    _is_prepared: bool = False
     def __init__(self, orchestrator: BatchedPenalizerOrchestrator):
         self.orchestrator = orchestrator
+        self._is_prepared = False
     def is_prepared(self) -> bool:
         return self._is_prepared
@@ -293,9 +267,7 @@ class _BatchedPenalizer(abc.ABC):
         return self._apply(logits=logits)
-    def filter(
-        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
-    ):
+    def filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
         if not self.is_prepared():
             return
@@ -360,9 +332,7 @@ class _BatchedPenalizer(abc.ABC):
         pass
     @abc.abstractmethod
-    def _filter(
-        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
-    ):
+    def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
         """
         Filter the penalizer (tensors or underlying data) based on the indices to keep in the batch.
         """

sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py CHANGED Viewed

@@ -1,8 +1,8 @@
-import typing
+from typing import List
 import torch
-from ..orchestrator import _BatchedPenalizer, _TokenIDs
+from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs
 class BatchedFrequencyPenalizer(_BatchedPenalizer):
@@ -44,9 +44,6 @@ class BatchedFrequencyPenalizer(_BatchedPenalizer):
         )
     def _teardown(self):
-        del self.frequency_penalties
-        del self.cumulated_frequency_penalties
         self.frequency_penalties = None
         self.cumulated_frequency_penalties = None
@@ -62,9 +59,7 @@ class BatchedFrequencyPenalizer(_BatchedPenalizer):
         logits -= self.cumulated_frequency_penalties
         return logits
-    def _filter(
-        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
-    ):
+    def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
         self.frequency_penalties = self.frequency_penalties[indices_tensor_to_keep]
         self.cumulated_frequency_penalties = self.cumulated_frequency_penalties[
             indices_tensor_to_keep

sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py CHANGED Viewed

@@ -1,8 +1,8 @@
-import typing
+from typing import List
 import torch
-from ..orchestrator import _BatchedPenalizer, _TokenIDs
+from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs
 class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
@@ -70,10 +70,6 @@ class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
         )
     def _teardown(self):
-        del self.min_new_tokens
-        del self.stop_token_penalties
-        del self.len_output_tokens
         self.min_new_tokens = None
         self.stop_token_penalties = None
         self.len_output_tokens = None
@@ -89,9 +85,7 @@ class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
         logits[mask] += self.stop_token_penalties[mask]
         return logits
-    def _filter(
-        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
-    ):
+    def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
         self.min_new_tokens = self.min_new_tokens[indices_tensor_to_keep]
         self.stop_token_penalties = self.stop_token_penalties[indices_tensor_to_keep]
         self.len_output_tokens = self.len_output_tokens[indices_tensor_to_keep]

sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py CHANGED Viewed

@@ -1,8 +1,8 @@
-import typing
+from typing import List
 import torch
-from ..orchestrator import _BatchedPenalizer, _TokenIDs
+from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs
 class BatchedPresencePenalizer(_BatchedPenalizer):
@@ -44,9 +44,6 @@ class BatchedPresencePenalizer(_BatchedPenalizer):
         )
     def _teardown(self):
-        del self.presence_penalties
-        del self.cumulated_presence_penalties
         self.presence_penalties = None
         self.cumulated_presence_penalties = None
@@ -61,9 +58,7 @@ class BatchedPresencePenalizer(_BatchedPenalizer):
         logits -= self.cumulated_presence_penalties
         return logits
-    def _filter(
-        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
-    ):
+    def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
         self.presence_penalties = self.presence_penalties[indices_tensor_to_keep]
         self.cumulated_presence_penalties = self.cumulated_presence_penalties[
             indices_tensor_to_keep

sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py CHANGED Viewed

@@ -1,8 +1,8 @@
-import typing
+from typing import List
 import torch
-from ..orchestrator import _BatchedPenalizer, _TokenIDs
+from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs
 class BatchedRepetitionPenalizer(_BatchedPenalizer):
@@ -44,9 +44,6 @@ class BatchedRepetitionPenalizer(_BatchedPenalizer):
         )
     def _teardown(self):
-        del self.repetition_penalties
-        del self.cumulated_repetition_penalties
         self.repetition_penalties = None
         self.cumulated_repetition_penalties = None
@@ -65,9 +62,7 @@ class BatchedRepetitionPenalizer(_BatchedPenalizer):
             logits * self.cumulated_repetition_penalties,
         )
-    def _filter(
-        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
-    ):
+    def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
         self.repetition_penalties = self.repetition_penalties[indices_tensor_to_keep]
         self.cumulated_repetition_penalties = self.cumulated_repetition_penalties[
             indices_tensor_to_keep

sglang/srt/sampling/sampling_batch_info.py CHANGED Viewed

@@ -1,12 +1,17 @@
 from __future__ import annotations
 import dataclasses
-from typing import TYPE_CHECKING, List, Optional
+import logging
+import threading
+from typing import TYPE_CHECKING, Callable, List, Optional
 import torch
 import sglang.srt.sampling.penaltylib as penaltylib
+logger = logging.getLogger(__name__)
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ScheduleBatch
@@ -27,10 +32,11 @@ class SamplingBatchInfo:
     # Bias Tensors
     vocab_size: int
+    grammars: Optional[List] = None
+    sampling_info_done: Optional[threading.Event] = None
     logit_bias: torch.Tensor = None
     vocab_mask: Optional[torch.Tensor] = None
-    grammars: Optional[List] = None
+    apply_mask: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None
     # Penalizer
     penalizer_orchestrator: Optional[penaltylib.BatchedPenalizerOrchestrator] = None
@@ -42,10 +48,7 @@ class SamplingBatchInfo:
     @classmethod
     def from_schedule_batch(
-        cls,
-        batch: ScheduleBatch,
-        vocab_size: int,
-        disable_penalizer: bool,
+        cls, batch: ScheduleBatch, vocab_size: int, enable_overlap_schedule: bool
     ):
         reqs = batch.reqs
         device = batch.device
@@ -73,12 +76,39 @@ class SamplingBatchInfo:
             top_ks=top_ks,
             min_ps=min_ps,
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
-            is_all_greedy=top_ks.max().item() <= 1,
+            is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
             vocab_size=vocab_size,
             device=device,
         )
         # TODO (lianmin): `need_min_p_sampling` needs to be updated in filter and merge.
+        if enable_overlap_schedule:
+            # TODO (lianmin): Some penalizers such as frequency and presence depend on model outputs,
+            # so it is kind of tricky to make it work with overlap scheduler.
+            # It requires correcly updating the penalty logits before the sampling and syncing the events.
+            # We will support them later.
+            penalizers = {
+                penaltylib.BatchedMinNewTokensPenalizer,
+            }
+            if (
+                any(req.sampling_params.frequency_penalty != 0.0 for req in reqs)
+                or any(req.sampling_params.presence_penalty != 0.0 for req in reqs)
+                or any(req.sampling_params.repetition_penalty != 1.0 for req in reqs)
+            ):
+                logger.warning(
+                    "frequency_penalty, presence_penalty, and repetition_penalty are not supported "
+                    "when using the default overlap scheduler. They will be ignored. "
+                    "Please add `--disable-overlap` when launching the server if you need these features. "
+                    "The speed will be slower in that case."
+                )
+        else:
+            penalizers = {
+                penaltylib.BatchedFrequencyPenalizer,
+                penaltylib.BatchedMinNewTokensPenalizer,
+                penaltylib.BatchedPresencePenalizer,
+                penaltylib.BatchedRepetitionPenalizer,
+            }
         # Each penalizers will do nothing if they evaluate themselves as not required by looking at
         # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
         # should not add hefty computation overhead other than simple checks.
@@ -86,20 +116,12 @@ class SamplingBatchInfo:
         # While we choose not to even create the class instances if they are not required, this
         # could add additional complexity to the {ScheduleBatch} class, especially we need to
         # handle {filter_batch()} and {merge_batch()} cases as well.
-        if disable_penalizer:
-            ret.penalizer_orchestrator = None
-        else:
-            ret.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
-                vocab_size=vocab_size,
-                batch=batch,
-                device=batch.device,
-                Penalizers={
-                    penaltylib.BatchedFrequencyPenalizer,
-                    penaltylib.BatchedMinNewTokensPenalizer,
-                    penaltylib.BatchedPresencePenalizer,
-                    penaltylib.BatchedRepetitionPenalizer,
-                },
-            )
+        ret.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
+            vocab_size=vocab_size,
+            batch=batch,
+            device=batch.device,
+            Penalizers=penalizers,
+        )
         # Handle logit bias but only allocate when needed
         ret.logit_bias = None
@@ -110,9 +132,6 @@ class SamplingBatchInfo:
         return len(self.temperatures)
     def update_penalties(self):
-        if not self.penalizer_orchestrator:
-            return
         self.scaling_penalties = None
         self.linear_penalties = None
@@ -133,23 +152,31 @@ class SamplingBatchInfo:
                 self.linear_penalties = penalizer.apply(self.linear_penalties)
     def update_regex_vocab_mask(self):
-        if not self.grammars or not any(grammar for grammar in self.grammars):
+        if not self.grammars:
             self.vocab_mask = None
+            self.apply_mask = None
             return
-        self.vocab_mask = torch.zeros(
-            len(self.temperatures),
-            self.vocab_size,
-            dtype=torch.bool,
+        # find a grammar from the list
+        grammar = next(grammar for grammar in self.grammars if grammar)
+        # maybe we can reuse the existing mask?
+        self.vocab_mask = grammar.allocate_vocab_mask(
+            vocab_size=self.vocab_size,
+            batch_size=len(self.temperatures),
             device=self.device,
         )
+        self.apply_mask = type(grammar).apply_vocab_mask  # force to use static method
         for i, grammar in enumerate(self.grammars):
             if grammar is not None:
-                grammar.fill_vocab_mask(self.vocab_mask[i])
+                try:
+                    grammar.fill_vocab_mask(self.vocab_mask, i)
+                except RuntimeError:
+                    continue
     def filter_batch(self, unfinished_indices: List[int], new_indices: torch.Tensor):
-        if self.penalizer_orchestrator:
-            self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
+        self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
         for item in [
             "temperatures",
@@ -188,8 +215,7 @@ class SamplingBatchInfo:
         return None
     def merge_batch(self, other: "SamplingBatchInfo"):
-        if self.penalizer_orchestrator:
-            self.penalizer_orchestrator.merge(other.penalizer_orchestrator)
+        self.penalizer_orchestrator.merge(other.penalizer_orchestrator)
         for item in [
             "temperatures",
@@ -205,25 +231,3 @@ class SamplingBatchInfo:
         self.logit_bias = SamplingBatchInfo.merge_bias_tensor(
             self.logit_bias, other.logit_bias, len(self), len(other), self.device
         )
-    def copy(self):
-        return SamplingBatchInfo(
-            temperatures=self.temperatures,
-            top_ps=self.top_ps,
-            top_ks=self.top_ks,
-            min_ps=self.min_ps,
-            is_all_greedy=self.is_all_greedy,
-            need_min_p_sampling=self.need_min_p_sampling,
-            vocab_size=self.vocab_size,
-            device=self.device,
-        )
-    def to(self, device: str):
-        for item in [
-            "temperatures",
-            "top_ps",
-            "top_ks",
-            "min_ps",
-        ]:
-            value = getattr(self, item)
-            setattr(self, item, value.to(device, non_blocking=True))

sglang/srt/sampling/sampling_params.py CHANGED Viewed

@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Sampling parameters for text generation."""
 from typing import List, Optional, Union
@@ -24,7 +22,6 @@ class SamplingParams:
     def __init__(
         self,
         max_new_tokens: int = 128,
-        min_new_tokens: int = 0,
         stop: Optional[Union[str, List[str]]] = None,
         stop_token_ids: Optional[List[int]] = None,
         temperature: float = 1.0,
@@ -34,6 +31,7 @@ class SamplingParams:
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         repetition_penalty: float = 1.0,
+        min_new_tokens: int = 0,
         spaces_between_special_tokens: bool = True,
         regex: Optional[str] = None,
         n: int = 1,

sglang 0.3.5.post2__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl

sglang 0.3.5.post2__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl