sglang-0.2.10-py3-none-any.whl → sglang-0.2.12-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (89)
  1. sglang/__init__.py +8 -0
  2. sglang/api.py +10 -2
  3. sglang/bench_latency.py +151 -40
  4. sglang/bench_serving.py +46 -22
  5. sglang/check_env.py +24 -2
  6. sglang/global_config.py +0 -1
  7. sglang/lang/backend/base_backend.py +3 -1
  8. sglang/lang/backend/openai.py +8 -3
  9. sglang/lang/backend/runtime_endpoint.py +46 -29
  10. sglang/lang/choices.py +164 -0
  11. sglang/lang/compiler.py +2 -2
  12. sglang/lang/interpreter.py +6 -13
  13. sglang/lang/ir.py +14 -5
  14. sglang/srt/constrained/base_tool_cache.py +1 -1
  15. sglang/srt/constrained/fsm_cache.py +12 -2
  16. sglang/srt/layers/activation.py +33 -0
  17. sglang/srt/layers/{token_attention.py → decode_attention.py} +9 -5
  18. sglang/srt/layers/extend_attention.py +6 -1
  19. sglang/srt/layers/layernorm.py +65 -0
  20. sglang/srt/layers/logits_processor.py +6 -1
  21. sglang/srt/layers/pooler.py +50 -0
  22. sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} +5 -0
  23. sglang/srt/layers/radix_attention.py +4 -7
  24. sglang/srt/managers/detokenizer_manager.py +31 -9
  25. sglang/srt/managers/io_struct.py +63 -0
  26. sglang/srt/managers/policy_scheduler.py +173 -25
  27. sglang/srt/managers/schedule_batch.py +174 -380
  28. sglang/srt/managers/tokenizer_manager.py +197 -112
  29. sglang/srt/managers/tp_worker.py +299 -364
  30. sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py} +9 -4
  31. sglang/srt/mem_cache/chunk_cache.py +43 -20
  32. sglang/srt/mem_cache/memory_pool.py +10 -15
  33. sglang/srt/mem_cache/radix_cache.py +74 -40
  34. sglang/srt/model_executor/cuda_graph_runner.py +27 -12
  35. sglang/srt/model_executor/forward_batch_info.py +319 -0
  36. sglang/srt/model_executor/model_runner.py +30 -47
  37. sglang/srt/models/chatglm.py +1 -1
  38. sglang/srt/models/commandr.py +1 -1
  39. sglang/srt/models/dbrx.py +1 -1
  40. sglang/srt/models/deepseek.py +1 -1
  41. sglang/srt/models/deepseek_v2.py +1 -1
  42. sglang/srt/models/gemma.py +1 -1
  43. sglang/srt/models/gemma2.py +1 -2
  44. sglang/srt/models/gpt_bigcode.py +1 -1
  45. sglang/srt/models/grok.py +1 -1
  46. sglang/srt/models/internlm2.py +3 -8
  47. sglang/srt/models/llama2.py +5 -5
  48. sglang/srt/models/llama_classification.py +1 -1
  49. sglang/srt/models/llama_embedding.py +88 -0
  50. sglang/srt/models/llava.py +1 -2
  51. sglang/srt/models/llavavid.py +1 -2
  52. sglang/srt/models/minicpm.py +1 -1
  53. sglang/srt/models/mixtral.py +1 -1
  54. sglang/srt/models/mixtral_quant.py +1 -1
  55. sglang/srt/models/qwen.py +1 -1
  56. sglang/srt/models/qwen2.py +1 -1
  57. sglang/srt/models/qwen2_moe.py +1 -12
  58. sglang/srt/models/stablelm.py +1 -1
  59. sglang/srt/openai_api/adapter.py +189 -39
  60. sglang/srt/openai_api/protocol.py +43 -1
  61. sglang/srt/sampling/penaltylib/__init__.py +13 -0
  62. sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
  63. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
  64. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
  65. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
  66. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
  67. sglang/srt/sampling_params.py +31 -4
  68. sglang/srt/server.py +93 -21
  69. sglang/srt/server_args.py +30 -19
  70. sglang/srt/utils.py +31 -13
  71. sglang/test/run_eval.py +10 -1
  72. sglang/test/runners.py +63 -63
  73. sglang/test/simple_eval_humaneval.py +2 -8
  74. sglang/test/simple_eval_mgsm.py +203 -0
  75. sglang/test/srt/sampling/penaltylib/utils.py +337 -0
  76. sglang/test/test_layernorm.py +60 -0
  77. sglang/test/test_programs.py +4 -2
  78. sglang/test/test_utils.py +21 -3
  79. sglang/utils.py +0 -1
  80. sglang/version.py +1 -1
  81. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/METADATA +50 -31
  82. sglang-0.2.12.dist-info/RECORD +112 -0
  83. sglang/srt/layers/linear.py +0 -884
  84. sglang/srt/layers/quantization/__init__.py +0 -64
  85. sglang/srt/layers/quantization/fp8.py +0 -677
  86. sglang-0.2.10.dist-info/RECORD +0 -100
  87. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/LICENSE +0 -0
  88. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/WHEEL +0 -0
  89. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/top_level.txt +0 -0
@@ -18,19 +18,18 @@ limitations under the License.
  import logging
  import warnings
  from dataclasses import dataclass
- from enum import IntEnum, auto
- from typing import List, Union
+ from typing import List, Optional, Union

- import numpy as np
  import torch
  from flashinfer.sampling import top_k_top_p_sampling_from_probs

+ import sglang.srt.sampling.penaltylib as penaltylib
  from sglang.global_config import global_config
  from sglang.srt.constrained import RegexGuide
  from sglang.srt.constrained.jump_forward import JumpForwardMap
+ from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
  from sglang.srt.mem_cache.chunk_cache import ChunkCache
  from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
- from sglang.srt.mem_cache.radix_cache import RadixCache

  INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5

@@ -46,15 +45,6 @@ global_server_args_dict = {
  logger = logging.getLogger(__name__)


- class ForwardMode(IntEnum):
-     # Prefill a new sequence. This is deprecated now. "EXTEND" covers this case.
-     PREFILL = auto()
-     # Extend a sequence. The KV cache of the first part of the sequence is already computed (e.g., system prompt).
-     EXTEND = auto()
-     # Decode one token.
-     DECODE = auto()
-
-
  class BaseFinishReason:
      def __init__(self, is_error: bool = False):
          self.is_error = is_error
@@ -108,7 +98,10 @@ class Req:
          self.origin_input_ids_unpadded = origin_input_ids  # Before image padding
          self.origin_input_ids = origin_input_ids
          self.output_ids = []  # Each decode stage's output ids
-         self.input_ids = None  # input_ids = origin_input_ids + output_ids
+         self.fill_ids = None  # fill_ids = origin_input_ids + output_ids
+
+         # Memory info
+         self.req_pool_idx = None

          # For incremental decoding
          # ----- | --------- read_ids -------|
@@ -131,7 +124,7 @@ class Req:
          # For vision input
          self.pixel_values = None
          self.image_size = None
-         self.image_offset = 0
+         self.image_offset = None
          self.pad_value = None

          # Prefix info
@@ -149,6 +142,7 @@ class Req:

          # Logprobs
          self.return_logprob = False
+         self.embedding = None
          self.logprob_start_len = 0
          self.top_logprobs_num = 0
          self.normalized_prompt_logprob = None
@@ -169,6 +163,32 @@ class Req:
      def finished(self) -> bool:
          return self.finished_reason is not None

+     def init_next_round_input(self, tree_cache: Optional[BasePrefixCache] = None):
+         self.fill_ids = self.origin_input_ids + self.output_ids
+         if tree_cache is not None:
+             self.prefix_indices, self.last_node = tree_cache.match_prefix(
+                 rid=self.rid, key=self.adjust_max_prefix_ids()
+             )
+         self.extend_input_len = len(self.fill_ids) - len(self.prefix_indices)
+
+     def adjust_max_prefix_ids(self):
+         self.fill_ids = self.origin_input_ids + self.output_ids
+         input_len = len(self.fill_ids)
+         max_prefix_len = input_len
+
+         if self.sampling_params.max_new_tokens > 0:
+             # Need at least one token to compute logits
+             max_prefix_len = min(max_prefix_len, input_len - 1)
+
+         if self.return_logprob:
+             max_prefix_len = min(max_prefix_len, self.logprob_start_len)
+
+         if self.normalized_prompt_logprob is None:
+             # Need at least two tokens to compute normalized logprob
+             max_prefix_len = min(max_prefix_len, input_len - 2)
+
+         return self.fill_ids[:max_prefix_len]
+
      # Based on https://github.com/vllm-project/vllm/blob/7a64d24aad69e4d2548aa0bf528d9fe63428ab01/vllm/transformers_utils/detokenizer.py#L194-L313
      def init_incremental_detokenize(self):
          first_iter = self.surr_offset is None or self.read_offset is None
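The new Req.init_next_round_input / Req.adjust_max_prefix_ids pair above decides how much of a request may be served from the prefix cache before the next extend step: the cached prefix is clamped so that at least one token (two while the normalized prompt logprob is still unset) is recomputed, and so that the cache never covers positions at or beyond logprob_start_len when logprobs are requested. A minimal standalone sketch of the same clamping rule follows; the helper name and example values are hypothetical and not part of sglang.

def max_cacheable_prefix(fill_ids, max_new_tokens, return_logprob,
                         logprob_start_len, normalized_prompt_logprob):
    input_len = len(fill_ids)
    max_prefix_len = input_len
    if max_new_tokens > 0:
        # keep at least one uncached token so the forward pass produces logits
        max_prefix_len = min(max_prefix_len, input_len - 1)
    if return_logprob:
        max_prefix_len = min(max_prefix_len, logprob_start_len)
    if normalized_prompt_logprob is None:
        # the normalized prompt logprob needs at least two uncached tokens
        max_prefix_len = min(max_prefix_len, input_len - 2)
    return fill_ids[:max_prefix_len]

# Example: a 6-token prompt that requests logprobs from position 3 can reuse
# at most 3 cached prefix tokens.
assert len(max_cacheable_prefix(list(range(6)), 16, True, 3, None)) == 3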
@@ -183,6 +203,8 @@ class Req:
          return all_ids[self.surr_offset :], self.read_offset - self.surr_offset

      def get_next_inc_detokenization(self):
+         if self.tokenizer is None:
+             return False, ""
          read_ids, read_offset = self.init_incremental_detokenize()
          surr_ids = read_ids[:read_offset]

@@ -207,16 +229,18 @@ class Req:
              return

          if len(self.output_ids) >= self.sampling_params.max_new_tokens:
-             self.finished_reason = FINISH_LENGTH(len(self.output_ids))
+             self.finished_reason = FINISH_LENGTH(
+                 length=self.sampling_params.max_new_tokens
+             )
              return

-         if (
-             self.output_ids[-1] == self.tokenizer.eos_token_id
-             and not self.sampling_params.ignore_eos
-         ):
-             self.finished_reason = FINISH_MATCHED_TOKEN(
-                 matched=self.tokenizer.eos_token_id
-             )
+         last_token_id = self.output_ids[-1]
+         if self.tokenizer is None:
+             matched_eos = last_token_id in self.sampling_params.stop_token_ids
+         else:
+             matched_eos = last_token_id == self.tokenizer.eos_token_id
+         if matched_eos and not self.sampling_params.ignore_eos:
+             self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id)
              return

          if len(self.sampling_params.stop_strs) > 0:
@@ -284,20 +308,19 @@ class Req:


  @dataclass
- class Batch:
+ class ScheduleBatch:
      """Store all inforamtion of a batch."""

      # Request, memory pool, and cache
      reqs: List[Req]
      req_to_token_pool: ReqToTokenPool
      token_to_kv_pool: BaseTokenToKVPool
-     tree_cache: RadixCache
+     tree_cache: BasePrefixCache

      # Batched arguments to model runner
      input_ids: torch.Tensor = None
      req_pool_indices: torch.Tensor = None
      seq_lens: torch.Tensor = None
-     prefix_lens: torch.Tensor = None
      position_ids_offsets: torch.Tensor = None
      out_cache_loc: torch.Tensor = None
      extend_num_tokens: int = None
@@ -306,17 +329,11 @@ class Batch:
      return_logprob: bool = False
      top_logprobs_nums: List[int] = None

-     # For multimodal
-     pixel_values: List[torch.Tensor] = None
-     image_sizes: List[List[int]] = None
-     image_offsets: List[int] = None
-
      # Batched sampling params
      temperatures: torch.Tensor = None
      top_ps: torch.Tensor = None
      top_ks: torch.Tensor = None
-     frequency_penalties: torch.Tensor = None
-     presence_penalties: torch.Tensor = None
+     penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
      logit_bias: torch.Tensor = None

      @classmethod
@@ -331,6 +348,9 @@ class Batch:
              return_logprob=return_logprob,
          )

+     def batch_size(self):
+         return len(self.reqs) if self.reqs is not None else 0
+
      def is_empty(self):
          return len(self.reqs) == 0

@@ -338,52 +358,22 @@ class Batch:
          # Return whether batch has at least 1 streaming request
          return any(r.stream for r in self.reqs)

-     def prepare_for_extend(self, vocab_size: int, int_token_logit_bias: torch.Tensor):
-         device = "cuda"
-         bs = len(self.reqs)
-         reqs = self.reqs
-         input_ids = [r.input_ids[len(r.prefix_indices) :] for r in reqs]
-         prefix_indices = [r.prefix_indices for r in reqs]
-
-         # Handle prefix
-         flatten_input_ids = []
-         extend_lens = []
-         prefix_lens = []
-         seq_lens = []
-
-         req_pool_indices = self.req_to_token_pool.alloc(bs)
-
+     def alloc_req_slots(self, num_reqs):
+         req_pool_indices = self.req_to_token_pool.alloc(num_reqs)
          if req_pool_indices is None:
              raise RuntimeError(
                  "Out of memory. "
                  "Please set a smaller number for `--max-running-requests`."
              )
+         return req_pool_indices

-         req_pool_indices_cpu = req_pool_indices.cpu().numpy()
-         for i in range(bs):
-             flatten_input_ids.extend(input_ids[i])
-             extend_lens.append(len(input_ids[i]))
-
-             if len(prefix_indices[i]) == 0:
-                 prefix_lens.append(0)
-             else:
-                 prefix_lens.append(len(prefix_indices[i]))
-                 self.req_to_token_pool.req_to_token[req_pool_indices_cpu[i]][
-                     : len(prefix_indices[i])
-                 ] = prefix_indices[i]
-
-             seq_lens.append(prefix_lens[-1] + extend_lens[-1])
+     def alloc_token_slots(self, num_tokens: int):
+         out_cache_loc = self.token_to_kv_pool.alloc(num_tokens)

-         position_ids_offsets = torch.zeros((bs,), dtype=torch.int32, device=device)
-
-         # Allocate memory
-         seq_lens, prefix_lens = np.array(seq_lens), np.array(prefix_lens)
-         extend_num_tokens = seq_lens.sum() - prefix_lens.sum()
-         out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
          if out_cache_loc is None:
              if self.tree_cache is not None:
-                 self.tree_cache.evict(extend_num_tokens, self.token_to_kv_pool.free)
-                 out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
+                 self.tree_cache.evict(num_tokens, self.token_to_kv_pool.free)
+                 out_cache_loc = self.token_to_kv_pool.alloc(num_tokens)

              if out_cache_loc is None:
                  logger.error("Prefill out of memory. Try to lower your batch size.")
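The new alloc_req_slots / alloc_token_slots helpers above centralize an allocate-evict-retry pattern now shared by the extend and decode paths: if the KV pool cannot satisfy an allocation, entries are evicted from the prefix cache back into the pool and the allocation is retried once. A toy sketch of that pattern under simplified assumptions (ToyPool and ToyCache are hypothetical stand-ins, not sglang classes):

class ToyPool:
    def __init__(self, size):
        self.free_slots = list(range(size))

    def alloc(self, n):
        if len(self.free_slots) < n:
            return None
        taken, self.free_slots = self.free_slots[:n], self.free_slots[n:]
        return taken

    def free(self, slots):
        self.free_slots.extend(slots)

class ToyCache:
    def __init__(self):
        self.cached = []  # slots pinned by reusable cached prefixes

    def evict(self, num_tokens, free_fn):
        evicted, self.cached = self.cached[:num_tokens], self.cached[num_tokens:]
        free_fn(evicted)

pool, cache = ToyPool(4), ToyCache()
cache.cached = pool.alloc(3)   # 3 of 4 slots are held by cached prefixes
loc = pool.alloc(2)            # only 1 slot is free, so this fails
if loc is None:
    cache.evict(2, pool.free)  # release cached slots back to the pool
    loc = pool.alloc(2)        # the retry succeeds
assert loc is not None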
@@ -391,40 +381,11 @@ class Batch:
                      self.tree_cache.pretty_print()
                  exit(1)

-         pt = 0
-         for i in range(bs):
-             self.req_to_token_pool.req_to_token[req_pool_indices_cpu[i]][
-                 prefix_lens[i] : prefix_lens[i] + extend_lens[i]
-             ] = out_cache_loc[pt : pt + extend_lens[i]]
-             pt += extend_lens[i]
-
-         # Handle logit bias but only allocate when needed
-         logit_bias = None
-         for i in range(bs):
-             if reqs[i].sampling_params.dtype == "int":
-                 if logit_bias is None:
-                     logit_bias = torch.zeros(
-                         (bs, vocab_size), dtype=torch.float32, device=device
-                     )
-                 logit_bias[i][: len(int_token_logit_bias)] = int_token_logit_bias
-
-         # Set fields
-         self.input_ids = torch.tensor(
-             flatten_input_ids, dtype=torch.int32, device=device
-         )
-         self.pixel_values = [r.pixel_values for r in reqs]
-         self.image_sizes = [r.image_size for r in reqs]
-         self.image_offsets = [
-             r.image_offset - p_len for r, p_len in zip(reqs, prefix_lens)
-         ]
-         self.req_pool_indices = req_pool_indices
-         self.seq_lens = torch.tensor(seq_lens, dtype=torch.int32, device=device)
-         self.prefix_lens = torch.tensor(prefix_lens, dtype=torch.int32, device=device)
-         self.position_ids_offsets = position_ids_offsets
-         self.extend_num_tokens = extend_num_tokens
-         self.out_cache_loc = out_cache_loc
-         self.top_logprobs_nums = [r.top_logprobs_num for r in reqs]
+         return out_cache_loc

+     def batch_sampling_params(self, vocab_size, int_token_logit_bias):
+         device = "cuda"
+         bs, reqs = self.batch_size(), self.reqs
          self.temperatures = torch.tensor(
              [r.sampling_params.temperature for r in reqs],
              dtype=torch.float,
@@ -436,20 +397,79 @@ class Batch:
              dtype=torch.float,
          self.top_ks = torch.tensor(
              [r.sampling_params.top_k for r in reqs], dtype=torch.int, device=device
-         self.frequency_penalties = torch.tensor(
-             [r.sampling_params.frequency_penalty for r in reqs],
-             dtype=torch.float,
-             device=device,
-         )
-         self.presence_penalties = torch.tensor(
-             [r.sampling_params.presence_penalty for r in reqs],
-             dtype=torch.float,
+
+         # Each penalizers will do nothing if they evaluate themselves as not required by looking at
+         # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
+         # should not add hefty computation overhead other than simple checks.
+         #
+         # While we choose not to even create the class instances if they are not required, this
+         # could add additional complexity to the {ScheduleBatch} class, especially we need to
+         # handle {filter_batch()} and {merge()} cases as well.
+         self.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
+             vocab_size=vocab_size,
+             batch=self,
              device=device,
+             Penalizers={
+                 penaltylib.BatchedFrequencyPenalizer,
+                 penaltylib.BatchedMinNewTokensPenalizer,
+                 penaltylib.BatchedPresencePenalizer,
+                 penaltylib.BatchedRepetitionPenalizer,
+             },
          )
-         self.logit_bias = logit_bias
+
+         # Handle logit bias but only allocate when needed
+         self.logit_bias = None
+         for i in range(bs):
+             if reqs[i].sampling_params.dtype == "int":
+                 if self.logit_bias is None:
+                     self.logit_bias = torch.zeros(
+                         (bs, vocab_size), dtype=torch.float32, device=device
+                     )
+                 self.logit_bias[i][: len(int_token_logit_bias)] = int_token_logit_bias
+
+     def prepare_for_extend(self, vocab_size: int, int_token_logit_bias: torch.Tensor):
+         bs = self.batch_size()
+         reqs = self.reqs
+         input_ids = [r.fill_ids[len(r.prefix_indices) :] for r in reqs]
+         extend_num_tokens = sum(len(ids) for ids in input_ids)
+         seq_lens = []
+
+         # Allocate memory
+         req_pool_indices_cpu = self.alloc_req_slots(bs)
+         out_cache_loc = self.alloc_token_slots(extend_num_tokens)
+
+         pt = 0
+         for i, req in enumerate(reqs):
+             req.req_pool_idx = req_pool_indices_cpu[i]
+             pre_len, seq_len = len(req.prefix_indices), len(req.fill_ids)
+             ext_len = seq_len - pre_len
+             seq_lens.append(seq_len)
+
+             if pre_len > 0:
+                 self.req_to_token_pool.req_to_token[req.req_pool_idx][
+                     :pre_len
+                 ] = req.prefix_indices
+
+             self.req_to_token_pool.req_to_token[req.req_pool_idx][pre_len:seq_len] = (
+                 out_cache_loc[pt : pt + ext_len]
+             )
+             pt += ext_len
+
+         # Set fields
+         with torch.device("cuda"):
+             self.input_ids = torch.tensor(sum(input_ids, []), dtype=torch.int32)
+             self.req_pool_indices = torch.tensor(req_pool_indices_cpu)
+             self.seq_lens = torch.tensor(seq_lens, dtype=torch.int32)
+             self.position_ids_offsets = torch.zeros((bs,), dtype=torch.int64)
+
+         self.extend_num_tokens = extend_num_tokens
+         self.out_cache_loc = out_cache_loc
+         self.top_logprobs_nums = [r.top_logprobs_num for r in reqs]
+
+         self.batch_sampling_params(vocab_size, int_token_logit_bias)

      def check_decode_mem(self):
-         bs = len(self.reqs)
+         bs = self.batch_size()
          if self.token_to_kv_pool.available_size() >= bs:
              return True

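The per-batch frequency_penalties and presence_penalties tensors give way above to a single penaltylib.BatchedPenalizerOrchestrator (backed by the new sglang/srt/sampling/penaltylib modules in the file list); as the inline comment notes, each penalizer inspects the batch's sampling params and turns itself into a no-op when it is not required. Purely as an illustration of the arithmetic a batched frequency/presence penalty performs on the logits, and not the penaltylib implementation, a sketch:

import torch

def apply_freq_presence_penalties(logits, output_token_counts,
                                  frequency_penalty, presence_penalty):
    # logits, output_token_counts: [batch, vocab]; the penalties: [batch]
    logits = logits - frequency_penalty[:, None] * output_token_counts
    logits = logits - presence_penalty[:, None] * (output_token_counts > 0).float()
    return logits

logits = torch.zeros(2, 8)
counts = torch.zeros(2, 8)
counts[0, 3] = 2  # request 0 already emitted token 3 twice
out = apply_freq_presence_penalties(
    logits, counts, torch.tensor([0.5, 0.0]), torch.tensor([1.0, 0.0])
)
assert out[0, 3].item() == -(0.5 * 2 + 1.0)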
@@ -474,7 +494,6 @@ class Batch:

          retracted_reqs = []
          seq_lens_cpu = self.seq_lens.cpu().numpy()
-         req_pool_indices_cpu = self.req_pool_indices.cpu().numpy()
          while (
              self.token_to_kv_pool.available_size()
              < len(sorted_indices) * global_config.retract_decode_steps
@@ -492,20 +511,20 @@ class Batch:

              if isinstance(self.tree_cache, ChunkCache):
                  # ChunkCache does not have eviction
-                 token_indices = self.req_to_token_pool.req_to_token[
-                     req_pool_indices_cpu[idx]
-                 ][: seq_lens_cpu[idx]]
+                 token_indices = self.req_to_token_pool.req_to_token[req.req_pool_idx][
+                     : seq_lens_cpu[idx]
+                 ]
                  self.token_to_kv_pool.free(token_indices)
-                 self.req_to_token_pool.free(int(req_pool_indices_cpu[idx]))
+                 self.req_to_token_pool.free(req.req_pool_idx)
                  del self.tree_cache.entries[req.rid]
              else:
                  # TODO: apply more fine-grained retraction
                  last_uncached_pos = len(req.prefix_indices)
-                 token_indices = self.req_to_token_pool.req_to_token[
-                     req_pool_indices_cpu[idx]
-                 ][last_uncached_pos : seq_lens_cpu[idx]]
+                 token_indices = self.req_to_token_pool.req_to_token[req.req_pool_idx][
+                     last_uncached_pos : seq_lens_cpu[idx]
+                 ]
                  self.token_to_kv_pool.free(token_indices)
-                 self.req_to_token_pool.free(int(req_pool_indices_cpu[idx]))
+                 self.req_to_token_pool.free(req.req_pool_idx)

              # release the last node
              self.tree_cache.dec_lock_ref(req.last_node)
@@ -518,7 +537,7 @@ class Batch:
              residual_size = max(0, residual_size)
              self.tree_cache.evict(residual_size, self.token_to_kv_pool.free)

-             req.prefix_indices = None
+             req.prefix_indices = []
              req.last_node = None
              req.extend_input_len = 0

@@ -543,8 +562,6 @@ class Batch:
          jump_forward_reqs = []
          filter_indices = [i for i in range(len(self.reqs))]

-         req_pool_indices_cpu = None
-
          for i, req in enumerate(self.reqs):
              if req.jump_forward_map is not None:
                  jump_forward_bytes = req.jump_forward_map.jump_forward_byte(
@@ -594,17 +611,7 @@ class Batch:
                      req.vid += 1

                      # insert the old request into tree_cache
-                     if req_pool_indices_cpu is None:
-                         req_pool_indices_cpu = self.req_pool_indices.tolist()
-                     self.tree_cache.cache_req(
-                         rid=req.rid,
-                         token_ids=cur_all_ids,
-                         last_uncached_pos=len(req.prefix_indices),
-                         req_pool_idx=req_pool_indices_cpu[i],
-                     )
-
-                     # unlock the last node
-                     self.tree_cache.dec_lock_ref(req.last_node)
+                     self.tree_cache.cache_finished_req(req, cur_all_ids)

                      # re-applying image padding
                      if req.pixel_values is not None:
@@ -621,66 +628,74 @@ class Batch:
                      jump_forward_reqs.append(req)
                      filter_indices.remove(i)

-         if len(filter_indices) < len(self.reqs):
-             self.filter_batch(filter_indices)
+         self.filter_batch(filter_indices)

          return jump_forward_reqs

      def prepare_for_decode(self, input_ids=None):
          if input_ids is None:
              input_ids = [
-                 r.output_ids[-1] if r.output_ids else r.input_ids[-1] for r in self.reqs
+                 r.output_ids[-1] if r.output_ids else r.origin_input_ids[-1]
+                 for r in self.reqs
              ]
+         else:
+             self.penalizer_orchestrator.cumulate_input_tokens(input_ids)
+
          self.input_ids = torch.tensor(input_ids, dtype=torch.int32, device="cuda")
          self.seq_lens.add_(1)
-         self.prefix_lens = None

          # Alloc mem
-         bs = len(self.reqs)
-         self.out_cache_loc = self.token_to_kv_pool.alloc(bs)
-
-         if self.out_cache_loc is None:
-             logger.error("Decode out of memory. Try to lower your batch size.")
-             if self.tree_cache is not None:
-                 self.tree_cache.pretty_print()
-             exit(1)
+         bs = self.batch_size()
+         self.out_cache_loc = self.alloc_token_slots(bs)

          self.req_to_token_pool.req_to_token[
              self.req_pool_indices, self.seq_lens - 1
          ] = self.out_cache_loc

      def filter_batch(self, unfinished_indices: List[int]):
+         if unfinished_indices is None or len(unfinished_indices) == 0:
+             # Filter out all requests
+             self.reqs = []
+             return
+
+         if len(unfinished_indices) == len(self.reqs):
+             # No need to filter
+             return
+
          self.reqs = [self.reqs[i] for i in unfinished_indices]
          new_indices = torch.tensor(unfinished_indices, dtype=torch.int32, device="cuda")
          self.seq_lens = self.seq_lens[new_indices]
          self.input_ids = None
          self.req_pool_indices = self.req_pool_indices[new_indices]
-         self.prefix_lens = None
          self.position_ids_offsets = self.position_ids_offsets[new_indices]
          self.out_cache_loc = None
          self.top_logprobs_nums = [self.top_logprobs_nums[i] for i in unfinished_indices]
          self.return_logprob = any(req.return_logprob for req in self.reqs)

+         self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
+
          for item in [
              "temperatures",
              "top_ps",
              "top_ks",
-             "frequency_penalties",
-             "presence_penalties",
              "logit_bias",
          ]:
              self_val = getattr(self, item, None)
              if self_val is not None:  # logit_bias can be None
                  setattr(self, item, self_val[new_indices])

-     def merge(self, other: "Batch"):
+     def merge(self, other: "ScheduleBatch"):
+         # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because
+         # orchestrator.merge() depends on Batch.reqs during preparation of each penalizers, so it
+         # needs to be called with pre-merged Batch.reqs.
+         self.penalizer_orchestrator.merge(other.penalizer_orchestrator)
+
          self.reqs.extend(other.reqs)

          self.req_pool_indices = torch.concat(
              [self.req_pool_indices, other.req_pool_indices]
          )
          self.seq_lens = torch.concat([self.seq_lens, other.seq_lens])
-         self.prefix_lens = None
          self.position_ids_offsets = torch.concat(
              [self.position_ids_offsets, other.position_ids_offsets]
          )
@@ -692,8 +707,6 @@ class Batch:
              "temperatures",
              "top_ps",
              "top_ks",
-             "frequency_penalties",
-             "presence_penalties",
          ]:
              self_val = getattr(self, item, None)
              other_val = getattr(other, item, None)
@@ -717,6 +730,7 @@ class Batch:
              self.logit_bias = torch.concat([self.logit_bias, other.logit_bias])

      def sample(self, logits: torch.Tensor):
+         # TODO(lsyin): move this into a part of layer and run with CUDA Graph
          # Post process logits
          logits = logits.contiguous()
          logits.div_(self.temperatures)
@@ -734,7 +748,8 @@ class Batch:
                      ] = 1
                      logits[i].masked_fill_(~allowed_mask, float("-inf"))

-         # TODO(lmzheng): apply penalty
+         logits = self.penalizer_orchestrator.apply(logits)
+
          probs = torch.softmax(logits, dim=-1)

          if not global_server_args_dict["disable_flashinfer_sampling"]:
@@ -767,230 +782,9 @@ class Batch:
                      req.regex_fsm_state, batch_next_token_ids_cpu[i]
                  )

-         return batch_next_token_ids
-
-
- @dataclass
- class InputMetadata:
-     """Store all inforamtion of a forward pass."""
-
-     forward_mode: ForwardMode
-     batch_size: int
-     total_num_tokens: int
-     req_pool_indices: torch.Tensor
-     seq_lens: torch.Tensor
-     positions: torch.Tensor
-     req_to_token_pool: ReqToTokenPool
-     token_to_kv_pool: BaseTokenToKVPool
+         self.penalizer_orchestrator.cumulate_output_tokens(batch_next_token_ids)

-     # For extend
-     extend_seq_lens: torch.Tensor
-     extend_start_loc: torch.Tensor
-     extend_no_prefix: bool
-
-     # Output location of the KV cache
-     out_cache_loc: torch.Tensor = None
-
-     # Output options
-     return_logprob: bool = False
-     top_logprobs_nums: List[int] = None
-
-     # Trition attention backend
-     triton_max_seq_len: int = 0
-     triton_max_extend_len: int = 0
-     triton_start_loc: torch.Tensor = None
-     triton_prefix_lens: torch.Tensor = None
-
-     # FlashInfer attention backend
-     flashinfer_prefill_wrapper_ragged: "BatchPrefillWithRaggedKVCacheWrapper" = None
-     flashinfer_prefill_wrapper_paged: "BatchPrefillWithPagedKVCacheWrapper" = None
-     flashinfer_decode_wrapper: "BatchDecodeWithPagedKVCacheWrapper" = None
-     flashinfer_use_ragged: bool = False
-
-     @classmethod
-     def create(
-         cls,
-         model_runner,
-         forward_mode,
-         req_pool_indices,
-         seq_lens,
-         prefix_lens,
-         position_ids_offsets,
-         out_cache_loc,
-         top_logprobs_nums=None,
-         return_logprob=False,
-         skip_flashinfer_init=False,
-     ):
-         flashinfer_use_ragged = False
-         if not skip_flashinfer_init and not model_runner.server_args.disable_flashinfer:
-             if forward_mode != ForwardMode.DECODE and int(torch.sum(seq_lens)) > 4096:
-                 flashinfer_use_ragged = True
-             init_flashinfer_args(
-                 forward_mode,
-                 model_runner,
-                 req_pool_indices,
-                 seq_lens,
-                 prefix_lens,
-                 model_runner.flashinfer_decode_wrapper,
-                 flashinfer_use_ragged,
-             )
-
-         batch_size = len(req_pool_indices)
-
-         if forward_mode == ForwardMode.DECODE:
-             positions = ((seq_lens - 1) + position_ids_offsets).to(torch.int64)
-             extend_seq_lens = extend_start_loc = extend_no_prefix = None
-             if not model_runner.server_args.disable_flashinfer:
-                 # This variable is not needed in this case,
-                 # we do not compute it to make it compatbile with cuda graph.
-                 total_num_tokens = None
-             else:
-                 total_num_tokens = int(torch.sum(seq_lens))
-         else:
-             seq_lens_cpu = seq_lens.cpu().numpy()
-             prefix_lens_cpu = prefix_lens.cpu().numpy()
-             position_ids_offsets_cpu = position_ids_offsets.cpu().numpy()
-             positions = torch.tensor(
-                 np.concatenate(
-                     [
-                         np.arange(
-                             prefix_lens_cpu[i] + position_ids_offsets_cpu[i],
-                             seq_lens_cpu[i] + position_ids_offsets_cpu[i],
-                         )
-                         for i in range(batch_size)
-                     ],
-                     axis=0,
-                 ),
-                 device="cuda",
-             )
-             extend_seq_lens = seq_lens - prefix_lens
-             extend_start_loc = torch.zeros_like(seq_lens)
-             extend_start_loc[1:] = torch.cumsum(extend_seq_lens[:-1], dim=0)
-             extend_no_prefix = torch.all(prefix_lens == 0)
-             total_num_tokens = int(torch.sum(seq_lens))
-
-         ret = cls(
-             forward_mode=forward_mode,
-             batch_size=batch_size,
-             total_num_tokens=total_num_tokens,
-             req_pool_indices=req_pool_indices,
-             seq_lens=seq_lens,
-             positions=positions,
-             req_to_token_pool=model_runner.req_to_token_pool,
-             token_to_kv_pool=model_runner.token_to_kv_pool,
-             out_cache_loc=out_cache_loc,
-             extend_seq_lens=extend_seq_lens,
-             extend_start_loc=extend_start_loc,
-             extend_no_prefix=extend_no_prefix,
-             return_logprob=return_logprob,
-             top_logprobs_nums=top_logprobs_nums,
-             flashinfer_prefill_wrapper_ragged=model_runner.flashinfer_prefill_wrapper_ragged,
-             flashinfer_prefill_wrapper_paged=model_runner.flashinfer_prefill_wrapper_paged,
-             flashinfer_decode_wrapper=model_runner.flashinfer_decode_wrapper,
-             flashinfer_use_ragged=flashinfer_use_ragged,
-         )
-
-         if model_runner.server_args.disable_flashinfer:
-             (
-                 ret.triton_max_seq_len,
-                 ret.triton_max_extend_len,
-                 ret.triton_start_loc,
-                 ret.triton_prefix_lens,
-             ) = init_triton_args(forward_mode, seq_lens, prefix_lens)
-
-         return ret
-
-
- def init_flashinfer_args(
-     forward_mode,
-     model_runner,
-     req_pool_indices,
-     seq_lens,
-     prefix_lens,
-     flashinfer_decode_wrapper,
-     flashinfer_use_ragged=False,
- ):
-     """Init auxiliary variables for FlashInfer attention backend."""
-     num_qo_heads = model_runner.model_config.num_attention_heads // model_runner.tp_size
-     num_kv_heads = model_runner.model_config.get_num_kv_heads(model_runner.tp_size)
-     head_dim = model_runner.model_config.head_dim
-     batch_size = len(req_pool_indices)
-     total_num_tokens = int(torch.sum(seq_lens))
-
-     if flashinfer_use_ragged:
-         paged_kernel_lens = prefix_lens
-     else:
-         paged_kernel_lens = seq_lens
-
-     kv_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda")
-     kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0)
-     req_pool_indices_cpu = req_pool_indices.cpu().numpy()
-     paged_kernel_lens_cpu = paged_kernel_lens.cpu().numpy()
-     kv_indices = torch.cat(
-         [
-             model_runner.req_to_token_pool.req_to_token[
-                 req_pool_indices_cpu[i], : paged_kernel_lens_cpu[i]
-             ]
-             for i in range(batch_size)
-         ],
-         dim=0,
-     ).contiguous()
-     kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda")
-
-     if forward_mode == ForwardMode.DECODE:
-         flashinfer_decode_wrapper.end_forward()
-         flashinfer_decode_wrapper.begin_forward(
-             kv_indptr,
-             kv_indices,
-             kv_last_page_len,
-             num_qo_heads,
-             num_kv_heads,
-             head_dim,
-             1,
-         )
-     else:
-         # extend part
-         qo_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda")
-         qo_indptr[1:] = torch.cumsum(seq_lens - prefix_lens, dim=0)
-
-         if flashinfer_use_ragged:
-             model_runner.flashinfer_prefill_wrapper_ragged.end_forward()
-             model_runner.flashinfer_prefill_wrapper_ragged.begin_forward(
-                 qo_indptr,
-                 qo_indptr,
-                 num_qo_heads,
-                 num_kv_heads,
-                 head_dim,
-             )
-
-         # cached part
-         model_runner.flashinfer_prefill_wrapper_paged.end_forward()
-         model_runner.flashinfer_prefill_wrapper_paged.begin_forward(
-             qo_indptr,
-             kv_indptr,
-             kv_indices,
-             kv_last_page_len,
-             num_qo_heads,
-             num_kv_heads,
-             head_dim,
-             1,
-         )
-
-
- def init_triton_args(forward_mode, seq_lens, prefix_lens):
-     """Init auxiliary variables for triton attention backend."""
-     batch_size = len(seq_lens)
-     max_seq_len = int(torch.max(seq_lens))
-     start_loc = torch.zeros((batch_size,), dtype=torch.int32, device="cuda")
-     start_loc[1:] = torch.cumsum(seq_lens[:-1], dim=0)
-
-     if forward_mode == ForwardMode.DECODE:
-         max_extend_len = None
-     else:
-         extend_seq_lens = seq_lens - prefix_lens
-         max_extend_len = int(torch.max(extend_seq_lens))
-
-     return max_seq_len, max_extend_len, start_loc, prefix_lens
+         return batch_next_token_ids


  def top_k_top_p_sampling_from_probs_torch(
@@ -1009,7 +803,7 @@ def top_k_top_p_sampling_from_probs_torch(
          sampled_index = torch.multinomial(probs_sort, num_samples=1)
      except RuntimeError:
          batch_next_token_ids = torch.zeros(
-             (probs_sort.shape[0],), dtype=torch.int64, device=probs.device
+             (probs_sort.shape[0],), dtype=torch.int32, device=probs.device
          )
          success = torch.zeros(probs.shape[0], dtype=torch.bool, device=probs.device)
          return batch_next_token_ids, success