PyPI - sglang - Versions diffs - 0.3.5.post1__py3-none-any.whl → 0.3.6__py3-none-any.whl - Mend

sglang 0.3.5.post1py3-none-any.whl → 0.3.6py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

sglang/bench_latency.py +1 -553
sglang/bench_offline_throughput.py +337 -0
sglang/bench_one_batch.py +474 -0
sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
sglang/bench_serving.py +115 -31
sglang/check_env.py +3 -6
sglang/srt/constrained/base_grammar_backend.py +4 -3
sglang/srt/constrained/outlines_backend.py +39 -26
sglang/srt/constrained/xgrammar_backend.py +58 -14
sglang/srt/layers/activation.py +3 -0
sglang/srt/layers/attention/flashinfer_backend.py +93 -48
sglang/srt/layers/attention/triton_backend.py +9 -7
sglang/srt/layers/custom_op_util.py +26 -0
sglang/srt/layers/fused_moe/fused_moe.py +11 -4
sglang/srt/layers/fused_moe/patch.py +4 -2
sglang/srt/layers/layernorm.py +4 -0
sglang/srt/layers/logits_processor.py +10 -10
sglang/srt/layers/sampler.py +4 -8
sglang/srt/layers/torchao_utils.py +2 -0
sglang/srt/managers/data_parallel_controller.py +74 -9
sglang/srt/managers/detokenizer_manager.py +1 -14
sglang/srt/managers/io_struct.py +27 -0
sglang/srt/managers/schedule_batch.py +104 -38
sglang/srt/managers/schedule_policy.py +5 -1
sglang/srt/managers/scheduler.py +210 -56
sglang/srt/managers/session_controller.py +62 -0
sglang/srt/managers/tokenizer_manager.py +38 -0
sglang/srt/managers/tp_worker.py +12 -1
sglang/srt/managers/tp_worker_overlap_thread.py +49 -52
sglang/srt/model_executor/cuda_graph_runner.py +43 -6
sglang/srt/model_executor/forward_batch_info.py +109 -15
sglang/srt/model_executor/model_runner.py +102 -43
sglang/srt/model_parallel.py +98 -0
sglang/srt/models/deepseek_v2.py +147 -44
sglang/srt/models/gemma2.py +9 -8
sglang/srt/models/llava.py +1 -1
sglang/srt/models/llavavid.py +1 -1
sglang/srt/models/olmo.py +3 -3
sglang/srt/models/phi3_small.py +447 -0
sglang/srt/models/qwen2_vl.py +13 -6
sglang/srt/models/torch_native_llama.py +94 -78
sglang/srt/openai_api/adapter.py +11 -4
sglang/srt/openai_api/protocol.py +30 -27
sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
sglang/srt/sampling/sampling_batch_info.py +58 -57
sglang/srt/sampling/sampling_params.py +3 -3
sglang/srt/server.py +29 -2
sglang/srt/server_args.py +97 -60
sglang/srt/utils.py +103 -51
sglang/test/runners.py +25 -6
sglang/test/srt/sampling/penaltylib/utils.py +23 -21
sglang/test/test_utils.py +33 -22
sglang/version.py +1 -1
{sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/METADATA +43 -43
{sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/RECORD +62 -56
{sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/WHEEL +1 -1
{sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/LICENSE +0 -0
{sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/top_level.txt +0 -0

sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py CHANGED Viewed

@@ -1,8 +1,8 @@
-import typing
+from typing import List
 import torch
-from ..orchestrator import _BatchedPenalizer, _TokenIDs
+from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs
 class BatchedFrequencyPenalizer(_BatchedPenalizer):
@@ -44,9 +44,6 @@ class BatchedFrequencyPenalizer(_BatchedPenalizer):
         )
     def _teardown(self):
-        del self.frequency_penalties
-        del self.cumulated_frequency_penalties
         self.frequency_penalties = None
         self.cumulated_frequency_penalties = None
@@ -62,9 +59,7 @@ class BatchedFrequencyPenalizer(_BatchedPenalizer):
         logits -= self.cumulated_frequency_penalties
         return logits
-    def _filter(
-        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
-    ):
+    def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
         self.frequency_penalties = self.frequency_penalties[indices_tensor_to_keep]
         self.cumulated_frequency_penalties = self.cumulated_frequency_penalties[
             indices_tensor_to_keep

sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py CHANGED Viewed

@@ -1,8 +1,8 @@
-import typing
+from typing import List
 import torch
-from ..orchestrator import _BatchedPenalizer, _TokenIDs
+from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs
 class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
@@ -70,10 +70,6 @@ class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
         )
     def _teardown(self):
-        del self.min_new_tokens
-        del self.stop_token_penalties
-        del self.len_output_tokens
         self.min_new_tokens = None
         self.stop_token_penalties = None
         self.len_output_tokens = None
@@ -89,9 +85,7 @@ class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
         logits[mask] += self.stop_token_penalties[mask]
         return logits
-    def _filter(
-        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
-    ):
+    def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
         self.min_new_tokens = self.min_new_tokens[indices_tensor_to_keep]
         self.stop_token_penalties = self.stop_token_penalties[indices_tensor_to_keep]
         self.len_output_tokens = self.len_output_tokens[indices_tensor_to_keep]

sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py CHANGED Viewed

@@ -1,8 +1,8 @@
-import typing
+from typing import List
 import torch
-from ..orchestrator import _BatchedPenalizer, _TokenIDs
+from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs
 class BatchedPresencePenalizer(_BatchedPenalizer):
@@ -44,9 +44,6 @@ class BatchedPresencePenalizer(_BatchedPenalizer):
         )
     def _teardown(self):
-        del self.presence_penalties
-        del self.cumulated_presence_penalties
         self.presence_penalties = None
         self.cumulated_presence_penalties = None
@@ -61,9 +58,7 @@ class BatchedPresencePenalizer(_BatchedPenalizer):
         logits -= self.cumulated_presence_penalties
         return logits
-    def _filter(
-        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
-    ):
+    def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
         self.presence_penalties = self.presence_penalties[indices_tensor_to_keep]
         self.cumulated_presence_penalties = self.cumulated_presence_penalties[
             indices_tensor_to_keep

sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py CHANGED Viewed

@@ -1,8 +1,8 @@
-import typing
+from typing import List
 import torch
-from ..orchestrator import _BatchedPenalizer, _TokenIDs
+from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs
 class BatchedRepetitionPenalizer(_BatchedPenalizer):
@@ -44,9 +44,6 @@ class BatchedRepetitionPenalizer(_BatchedPenalizer):
         )
     def _teardown(self):
-        del self.repetition_penalties
-        del self.cumulated_repetition_penalties
         self.repetition_penalties = None
         self.cumulated_repetition_penalties = None
@@ -65,9 +62,7 @@ class BatchedRepetitionPenalizer(_BatchedPenalizer):
             logits * self.cumulated_repetition_penalties,
         )
-    def _filter(
-        self, indices_to_keep: typing.List[int], indices_tensor_to_keep: torch.Tensor
-    ):
+    def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
         self.repetition_penalties = self.repetition_penalties[indices_tensor_to_keep]
         self.cumulated_repetition_penalties = self.cumulated_repetition_penalties[
             indices_tensor_to_keep

sglang/srt/sampling/sampling_batch_info.py CHANGED Viewed

@@ -1,12 +1,17 @@
 from __future__ import annotations
 import dataclasses
-from typing import TYPE_CHECKING, List, Optional
+import logging
+import threading
+from typing import TYPE_CHECKING, Callable, List, Optional
 import torch
 import sglang.srt.sampling.penaltylib as penaltylib
+logger = logging.getLogger(__name__)
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ScheduleBatch
@@ -27,10 +32,11 @@ class SamplingBatchInfo:
     # Bias Tensors
     vocab_size: int
+    grammars: Optional[List] = None
+    sampling_info_done: Optional[threading.Event] = None
     logit_bias: torch.Tensor = None
     vocab_mask: Optional[torch.Tensor] = None
-    grammars: Optional[List] = None
+    apply_mask: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None
     # Penalizer
     penalizer_orchestrator: Optional[penaltylib.BatchedPenalizerOrchestrator] = None
@@ -42,10 +48,7 @@ class SamplingBatchInfo:
     @classmethod
     def from_schedule_batch(
-        cls,
-        batch: ScheduleBatch,
-        vocab_size: int,
-        disable_penalizer: bool,
+        cls, batch: ScheduleBatch, vocab_size: int, enable_overlap_schedule: bool
     ):
         reqs = batch.reqs
         device = batch.device
@@ -73,12 +76,39 @@ class SamplingBatchInfo:
             top_ks=top_ks,
             min_ps=min_ps,
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
-            is_all_greedy=top_ks.max().item() <= 1,
+            is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
             vocab_size=vocab_size,
             device=device,
         )
         # TODO (lianmin): `need_min_p_sampling` needs to be updated in filter and merge.
+        if enable_overlap_schedule:
+            # TODO (lianmin): Some penalizers such as frequency and presence depend on model outputs,
+            # so it is kind of tricky to make it work with overlap scheduler.
+            # It requires correcly updating the penalty logits before the sampling and syncing the events.
+            # We will support them later.
+            penalizers = {
+                penaltylib.BatchedMinNewTokensPenalizer,
+            }
+            if (
+                any(req.sampling_params.frequency_penalty != 0.0 for req in reqs)
+                or any(req.sampling_params.presence_penalty != 0.0 for req in reqs)
+                or any(req.sampling_params.repetition_penalty != 1.0 for req in reqs)
+            ):
+                logger.warning(
+                    "frequency_penalty, presence_penalty, and repetition_penalty are not supported "
+                    "when using the default overlap scheduler. They will be ignored. "
+                    "Please add `--disable-overlap` when launching the server if you need these features. "
+                    "The speed will be slower in that case."
+                )
+        else:
+            penalizers = {
+                penaltylib.BatchedFrequencyPenalizer,
+                penaltylib.BatchedMinNewTokensPenalizer,
+                penaltylib.BatchedPresencePenalizer,
+                penaltylib.BatchedRepetitionPenalizer,
+            }
         # Each penalizers will do nothing if they evaluate themselves as not required by looking at
         # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
         # should not add hefty computation overhead other than simple checks.
@@ -86,20 +116,12 @@ class SamplingBatchInfo:
         # While we choose not to even create the class instances if they are not required, this
         # could add additional complexity to the {ScheduleBatch} class, especially we need to
         # handle {filter_batch()} and {merge_batch()} cases as well.
-        if disable_penalizer:
-            ret.penalizer_orchestrator = None
-        else:
-            ret.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
-                vocab_size=vocab_size,
-                batch=batch,
-                device=batch.device,
-                Penalizers={
-                    penaltylib.BatchedFrequencyPenalizer,
-                    penaltylib.BatchedMinNewTokensPenalizer,
-                    penaltylib.BatchedPresencePenalizer,
-                    penaltylib.BatchedRepetitionPenalizer,
-                },
-            )
+        ret.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
+            vocab_size=vocab_size,
+            batch=batch,
+            device=batch.device,
+            Penalizers=penalizers,
+        )
         # Handle logit bias but only allocate when needed
         ret.logit_bias = None
@@ -110,9 +132,6 @@ class SamplingBatchInfo:
         return len(self.temperatures)
     def update_penalties(self):
-        if not self.penalizer_orchestrator:
-            return
         self.scaling_penalties = None
         self.linear_penalties = None
@@ -133,23 +152,28 @@ class SamplingBatchInfo:
                 self.linear_penalties = penalizer.apply(self.linear_penalties)
     def update_regex_vocab_mask(self):
-        if not self.grammars or not any(grammar for grammar in self.grammars):
+        if not self.grammars:
             self.vocab_mask = None
+            self.apply_mask = None
             return
-        self.vocab_mask = torch.zeros(
-            len(self.temperatures),
-            self.vocab_size,
-            dtype=torch.bool,
+        # find a grammar from the list
+        grammar = next(grammar for grammar in self.grammars if grammar)
+        # maybe we can reuse the existing mask?
+        self.vocab_mask = grammar.allocate_vocab_mask(
+            vocab_size=self.vocab_size,
+            batch_size=len(self.temperatures),
             device=self.device,
         )
+        self.apply_mask = type(grammar).apply_vocab_mask  # force to use static method
         for i, grammar in enumerate(self.grammars):
             if grammar is not None:
-                grammar.fill_vocab_mask(self.vocab_mask[i])
+                grammar.fill_vocab_mask(self.vocab_mask, i)
     def filter_batch(self, unfinished_indices: List[int], new_indices: torch.Tensor):
-        if self.penalizer_orchestrator:
-            self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
+        self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
         for item in [
             "temperatures",
@@ -188,8 +212,7 @@ class SamplingBatchInfo:
         return None
     def merge_batch(self, other: "SamplingBatchInfo"):
-        if self.penalizer_orchestrator:
-            self.penalizer_orchestrator.merge(other.penalizer_orchestrator)
+        self.penalizer_orchestrator.merge(other.penalizer_orchestrator)
         for item in [
             "temperatures",
@@ -205,25 +228,3 @@ class SamplingBatchInfo:
         self.logit_bias = SamplingBatchInfo.merge_bias_tensor(
             self.logit_bias, other.logit_bias, len(self), len(other), self.device
         )
-    def copy(self):
-        return SamplingBatchInfo(
-            temperatures=self.temperatures,
-            top_ps=self.top_ps,
-            top_ks=self.top_ks,
-            min_ps=self.min_ps,
-            is_all_greedy=self.is_all_greedy,
-            need_min_p_sampling=self.need_min_p_sampling,
-            vocab_size=self.vocab_size,
-            device=self.device,
-        )
-    def to(self, device: str):
-        for item in [
-            "temperatures",
-            "top_ps",
-            "top_ks",
-            "min_ps",
-        ]:
-            value = getattr(self, item)
-            setattr(self, item, value.to(device, non_blocking=True))

sglang/srt/sampling/sampling_params.py CHANGED Viewed

@@ -24,7 +24,6 @@ class SamplingParams:
     def __init__(
         self,
         max_new_tokens: int = 128,
-        min_new_tokens: int = 0,
         stop: Optional[Union[str, List[str]]] = None,
         stop_token_ids: Optional[List[int]] = None,
         temperature: float = 1.0,
@@ -34,13 +33,14 @@ class SamplingParams:
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         repetition_penalty: float = 1.0,
-        ignore_eos: bool = False,
-        skip_special_tokens: bool = True,
+        min_new_tokens: int = 0,
         spaces_between_special_tokens: bool = True,
         regex: Optional[str] = None,
         n: int = 1,
         json_schema: Optional[str] = None,
         no_stop_trim: bool = False,
+        ignore_eos: bool = False,
+        skip_special_tokens: bool = True,
     ) -> None:
         self.temperature = temperature
         self.top_p = top_p

sglang/srt/server.py CHANGED Viewed

@@ -50,8 +50,10 @@ from sglang.srt.managers.data_parallel_controller import (
 )
 from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
 from sglang.srt.managers.io_struct import (
+    CloseSessionReqInput,
     EmbeddingReqInput,
     GenerateReqInput,
+    OpenSessionReqInput,
     UpdateWeightReqInput,
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
@@ -139,6 +141,7 @@ async def get_model_info():
     """Get the model information."""
     result = {
         "model_path": tokenizer_manager.model_path,
+        "tokenizer_path": tokenizer_manager.server_args.tokenizer_path,
         "is_generation": tokenizer_manager.is_generation,
     }
     return result
@@ -214,6 +217,30 @@ async def update_weights(obj: UpdateWeightReqInput, request: Request):
         )
+@app.api_route("/open_session", methods=["GET", "POST"])
+async def open_session(obj: OpenSessionReqInput, request: Request):
+    """Open a session, and return its unique session id."""
+    try:
+        session_id = await tokenizer_manager.open_session(obj, request)
+        return session_id
+    except Exception as e:
+        return ORJSONResponse(
+            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+        )
+@app.api_route("/close_session", methods=["GET", "POST"])
+async def close_session(obj: CloseSessionReqInput, request: Request):
+    """Close the session"""
+    try:
+        await tokenizer_manager.close_session(obj, request)
+        return Response(status_code=200)
+    except Exception as e:
+        return ORJSONResponse(
+            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+        )
 @time_func_latency
 async def generate_request(obj: GenerateReqInput, request: Request):
     """Handle a generate request."""
@@ -391,7 +418,7 @@ def launch_engine(
         )
         for tp_rank in tp_rank_range:
             reader, writer = mp.Pipe(duplex=False)
-            gpu_id = tp_rank % tp_size_per_node
+            gpu_id = server_args.base_gpu_id + tp_rank % tp_size_per_node
             proc = mp.Process(
                 target=run_scheduler_process,
                 args=(server_args, port_args, gpu_id, tp_rank, None, writer),
@@ -768,7 +795,7 @@ class Engine:
         self,
         # The input prompt. It can be a single prompt or a batch of prompts.
         prompt: Optional[Union[List[str], str]] = None,
-        sampling_params: Optional[Dict] = None,
+        sampling_params: Optional[Union[List[Dict], Dict]] = None,
         # The token ids for text; one can either specify text or input_ids.
         input_ids: Optional[Union[List[List[int]], List[int]]] = None,
         return_logprob: Optional[Union[List[bool], bool]] = False,

sglang 0.3.5.post1__py3-none-any.whl → 0.3.6__py3-none-any.whl

sglang 0.3.5.post1py3-none-any.whl → 0.3.6py3-none-any.whl