sglang 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +33 -26
- sglang/api.py +9 -1
- sglang/bench_latency.py +2 -2
- sglang/bench_serving.py +10 -1
- sglang/check_env.py +1 -1
- sglang/lang/backend/litellm.py +1 -1
- sglang/lang/backend/openai.py +1 -1
- sglang/lang/interpreter.py +21 -5
- sglang/lang/ir.py +1 -2
- sglang/srt/constrained/__init__.py +15 -0
- sglang/srt/constrained/{base_cache.py → base_tool_cache.py} +17 -2
- sglang/srt/constrained/fsm_cache.py +17 -2
- sglang/srt/constrained/jump_forward.py +17 -2
- sglang/srt/conversation.py +26 -0
- sglang/srt/hf_transformers_utils.py +15 -0
- sglang/srt/layers/context_flashattention_nopad.py +15 -0
- sglang/srt/layers/extend_attention.py +15 -0
- sglang/srt/layers/fused_moe.py +15 -0
- sglang/srt/layers/linear.py +15 -0
- sglang/srt/layers/logits_processor.py +41 -13
- sglang/srt/layers/quantization/__init__.py +15 -0
- sglang/srt/layers/quantization/fp8.py +15 -0
- sglang/srt/layers/radix_attention.py +17 -2
- sglang/srt/layers/token_attention.py +16 -1
- sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
- sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
- sglang/srt/managers/detokenizer_manager.py +16 -1
- sglang/srt/managers/io_struct.py +36 -3
- sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
- sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +60 -21
- sglang/srt/managers/tokenizer_manager.py +39 -16
- sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +159 -46
- sglang/srt/mem_cache/base_cache.py +43 -0
- sglang/srt/mem_cache/chunk_cache.py +60 -0
- sglang/srt/mem_cache/flush_cache.py +33 -0
- sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
- sglang/srt/{managers/controller → mem_cache}/radix_cache.py +20 -2
- sglang/srt/mm_utils.py +15 -0
- sglang/srt/model_config.py +15 -0
- sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +16 -1
- sglang/srt/{managers/controller → model_executor}/model_runner.py +49 -14
- sglang/srt/model_loader/model_loader.py +15 -0
- sglang/srt/model_loader/utils.py +16 -1
- sglang/srt/models/chatglm.py +16 -1
- sglang/srt/models/commandr.py +16 -1
- sglang/srt/models/dbrx.py +16 -1
- sglang/srt/models/deepseek.py +16 -1
- sglang/srt/models/deepseek_v2.py +16 -1
- sglang/srt/models/gemma.py +16 -1
- sglang/srt/models/gemma2.py +16 -1
- sglang/srt/models/gpt_bigcode.py +16 -1
- sglang/srt/models/grok.py +16 -1
- sglang/srt/models/internlm2.py +16 -1
- sglang/srt/models/llama2.py +21 -22
- sglang/srt/models/llama_classification.py +16 -1
- sglang/srt/models/llava.py +17 -2
- sglang/srt/models/llavavid.py +17 -2
- sglang/srt/models/minicpm.py +16 -1
- sglang/srt/models/mistral.py +15 -0
- sglang/srt/models/mixtral.py +16 -1
- sglang/srt/models/mixtral_quant.py +16 -1
- sglang/srt/models/qwen.py +16 -1
- sglang/srt/models/qwen2.py +16 -1
- sglang/srt/models/qwen2_moe.py +16 -1
- sglang/srt/models/stablelm.py +16 -1
- sglang/srt/models/yivl.py +15 -0
- sglang/srt/openai_api/adapter.py +569 -131
- sglang/srt/openai_api/protocol.py +84 -2
- sglang/srt/sampling_params.py +15 -0
- sglang/srt/server.py +92 -23
- sglang/srt/server_args.py +52 -11
- sglang/srt/utils.py +15 -0
- sglang/test/test_programs.py +9 -6
- sglang/utils.py +22 -0
- sglang/version.py +1 -1
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/METADATA +33 -7
- sglang-0.2.8.dist-info/RECORD +95 -0
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/WHEEL +1 -1
- sglang/srt/flush_cache.py +0 -18
- sglang-0.2.6.dist-info/RECORD +0 -93
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/LICENSE +0 -0
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/top_level.txt +0 -0
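
Taken together, the list above shows three mechanical themes: an Apache-2.0 license header prepended to nearly every module (the recurring +15/+16 counts), the managers/controller package flattened into sglang/srt/managers, and the cache machinery gathered into a new sglang/srt/mem_cache package. For code that imported these internal modules directly, the renames translate as in the sketch below (these are internal paths, not a stable public API; the symbol names are taken from the hunks that follow):

# 0.2.6 (old paths, removed in 0.2.8):
#   from sglang.srt.managers.controller.infer_batch import Batch
#   from sglang.srt.managers.controller.radix_cache import RadixCache
#   from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool

# 0.2.8 (new paths, per the rename list above):
from sglang.srt.managers.schedule_batch import Batch
from sglang.srt.mem_cache.radix_cache import RadixCache
from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPool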
sglang/srt/layers/token_attention.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # Adapted from
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_softmax_and_reducev.py
@@ -5,7 +20,7 @@ import torch
 import triton
 import triton.language as tl
 
-from sglang.srt.managers.controller.infer_batch import global_server_args_dict
+from sglang.srt.managers.schedule_batch import global_server_args_dict
 
 if global_server_args_dict.get("attention_reduce_in_fp32", False):
     REDUCE_TRITON_TYPE = tl.float32
sglang/srt/managers/{controller/manager_multi.py → controller_multi.py}
RENAMED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """
 A controller that manages multiple data parallel workers.
 Each data parallel worker can manage multiple tensor parallel workers.
@@ -12,7 +27,7 @@ from enum import Enum, auto
 import numpy as np
 import zmq
 
-from sglang.srt.managers.controller.manager_single import (
+from sglang.srt.managers.controller_single import (
     start_controller_process as start_controller_process_single,
 )
 from sglang.srt.managers.io_struct import (
@@ -24,7 +39,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import kill_parent_process
 from sglang.utils import get_exception_traceback
 
-logger = logging.getLogger(
+logger = logging.getLogger(__name__)
 
 
 class LoadBalanceMethod(Enum):
sglang/srt/managers/{controller/manager_single.py → controller_single.py}
RENAMED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """A controller that manages a group of tensor parallel workers."""
 
 import logging
@@ -7,7 +22,7 @@ from typing import List
 
 import zmq
 
-from sglang.srt.managers.controller.tp_worker import (
+from sglang.srt.managers.tp_worker import (
     ModelTpServer,
     broadcast_recv_input,
     launch_tp_servers,
@@ -16,7 +31,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import kill_parent_process
 from sglang.utils import get_exception_traceback
 
-logger = logging.getLogger(
+logger = logging.getLogger(__name__)
 
 
 class ControllerSingle:
sglang/srt/managers/detokenizer_manager.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """DetokenizerManager is a process that detokenizes the token ids."""
 
 import asyncio
@@ -10,8 +25,8 @@ import zmq
 import zmq.asyncio
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.managers.controller.infer_batch import FINISH_MATCHED_STR
 from sglang.srt.managers.io_struct import BatchStrOut, BatchTokenIDOut
+from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.utils import find_printable_text, get_exception_traceback, graceful_registry
 
sglang/srt/managers/io_struct.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """
 The definition of objects transfered between different
 processes (TokenizerManager, DetokenizerManager, Controller).
@@ -7,7 +22,7 @@ import uuid
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
 
-from sglang.srt.managers.controller.infer_batch import BaseFinishReason
+from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling_params import SamplingParams
 
 
@@ -64,8 +79,26 @@ class GenerateReqInput:
             if self.top_logprobs_num is None:
                 self.top_logprobs_num = 0
         else:
-
-
+            parallel_sample_num_list = []
+            if isinstance(self.sampling_params, dict):
+                parallel_sample_num = self.sampling_params.get("n", 1)
+            elif isinstance(self.sampling_params, list):
+                for sp in self.sampling_params:
+                    parallel_sample_num = sp.get("n", 1)
+                    parallel_sample_num_list.append(parallel_sample_num)
+                parallel_sample_num = max(parallel_sample_num_list)
+                all_equal = all(
+                    element == parallel_sample_num
+                    for element in parallel_sample_num_list
+                )
+                if parallel_sample_num > 1 and (not all_equal):
+                    ## TODO cope with the case that the parallel_sample_num is different for different samples
+                    raise ValueError(
+                        "The parallel_sample_num should be the same for all samples in sample params."
+                    )
+            else:
+                parallel_sample_num = 1
+            self.parallel_sample_num = parallel_sample_num
 
         if parallel_sample_num != 1:
             # parallel sampling +1 represents the original prefill stage
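
The new __post_init__ branch above normalizes the OpenAI-style "n" (parallel sample count) across a batch and rejects mixed values. A standalone sketch of the same rule — the helper name is hypothetical, and sampling_params is assumed to be one dict or a list of dicts, as in the hunk:

def resolve_parallel_sample_num(sampling_params):
    # Hypothetical helper mirroring the normalization in the hunk above.
    if isinstance(sampling_params, dict):
        return sampling_params.get("n", 1)
    elif isinstance(sampling_params, list):
        nums = [sp.get("n", 1) for sp in sampling_params]
        n = max(nums)
        if n > 1 and any(x != n for x in nums):
            # Mixed "n" values across one batch are rejected, as above.
            raise ValueError(
                "The parallel_sample_num should be the same for all samples in sample params."
            )
        return n
    return 1

assert resolve_parallel_sample_num({"n": 4}) == 4
assert resolve_parallel_sample_num([{"n": 2}, {"n": 2}]) == 2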
sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py}
RENAMED
@@ -1,46 +1,61 @@
-"""
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Request policy scheduler"""
 
 import random
 from collections import defaultdict
 
 
-class ScheduleHeuristic:
+class PolicyScheduler:
     def __init__(
         self,
-
+        policy,
         max_running_seqs,
         max_prefill_num_tokens,
         max_total_num_tokens,
         tree_cache,
     ):
-        if tree_cache.disable and
+        if tree_cache.disable and policy == "lpm":
             # LMP is meaningless when the tree cache is disabled.
-
+            policy = "fcfs"
 
-        self.
+        self.policy = policy
         self.max_running_seqs = max_running_seqs
         self.max_prefill_num_tokens = max_prefill_num_tokens
         self.max_total_num_tokens = max_total_num_tokens
         self.tree_cache = tree_cache
 
-    def get_priority_queue(self,
-        if self.
+    def get_priority_queue(self, waiting_queue):
+        if self.policy == "lpm":
             # longest prefix match
-
-            return
-        elif self.
+            waiting_queue.sort(key=lambda x: -len(x.prefix_indices))
+            return waiting_queue
+        elif self.policy == "fcfs":
             # first come first serve
-            return
-        elif self.
+            return waiting_queue
+        elif self.policy == "lof":
             # longest output first
-
-            return
-        elif self.
-            random.shuffle(
-            return
-        elif self.
+            waiting_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens)
+            return waiting_queue
+        elif self.policy == "random":
+            random.shuffle(waiting_queue)
+            return waiting_queue
+        elif self.policy == "dfs-weight":
             last_node_to_reqs = defaultdict(list)
-            for req in
+            for req in waiting_queue:
                 last_node_to_reqs[req.last_node].append(req)
 
             node_to_weight = defaultdict(int)
@@ -52,10 +67,10 @@ class ScheduleHeuristic:
             self.get_dfs_priority(
                 self.tree_cache.root_node, node_to_weight, last_node_to_reqs, q
             )
-            assert len(q) == len(
+            assert len(q) == len(waiting_queue)
             return q
         else:
-            raise ValueError(f"Unknown
+            raise ValueError(f"Unknown schedule_policy: {self.policy}")
 
     def calc_weight(self, cur_node, node_to_weight):
         for child in cur_node.children.values():
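
Besides the ScheduleHeuristic → PolicyScheduler rename, the hunk spells out the five queue policies: "lpm" (longest prefix match), "fcfs" (first come first serve), "lof" (longest output first), "random", and "dfs-weight". A toy illustration of the two non-trivial sort keys, using stand-in requests whose fields mirror what the scheduler reads:

from types import SimpleNamespace

reqs = [
    SimpleNamespace(rid="a", prefix_indices=[0] * 10,
                    sampling_params=SimpleNamespace(max_new_tokens=32)),
    SimpleNamespace(rid="b", prefix_indices=[0] * 50,
                    sampling_params=SimpleNamespace(max_new_tokens=8)),
]

# "lpm": longest cached prefix first, favoring requests that reuse the radix cache.
reqs.sort(key=lambda x: -len(x.prefix_indices))
assert [r.rid for r in reqs] == ["b", "a"]

# "lof": longest expected output first.
reqs.sort(key=lambda x: -x.sampling_params.max_new_tokens)
assert [r.rid for r in reqs] == ["a", "b"]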
sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py}
RENAMED
@@ -1,5 +1,21 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Meta data for requests and batches"""
 
+import logging
 import warnings
 from dataclasses import dataclass
 from enum import IntEnum, auto
@@ -12,8 +28,9 @@ from flashinfer.sampling import top_k_top_p_sampling_from_probs
 from sglang.global_config import global_config
 from sglang.srt.constrained import RegexGuide
 from sglang.srt.constrained.jump_forward import JumpForwardMap
-from sglang.srt.managers.controller.radix_cache import RadixCache
-from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool
+from sglang.srt.mem_cache.chunk_cache import ChunkCache
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPool
+from sglang.srt.mem_cache.radix_cache import RadixCache
 
 INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
 
@@ -25,6 +42,9 @@ global_server_args_dict = {
 }
 
 
+logger = logging.getLogger(__name__)
+
+
 class ForwardMode(IntEnum):
     # Prefill a new sequence. This is deprecated now. "EXTEND" covers this case.
     PREFILL = auto()
@@ -364,7 +384,7 @@ class Batch:
         out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
 
         if out_cache_loc is None:
-
+            logger.error("Prefill out of memory. This should never happen.")
             self.tree_cache.pretty_print()
             exit()
 
@@ -467,15 +487,33 @@ class Batch:
             req = self.reqs[idx]
             retracted_reqs.append(req)
 
-
-
-
-
-
-
-
-
-
+            if isinstance(self.tree_cache, ChunkCache):
+                # ChunkCache does not have eviction
+                token_indices = self.req_to_token_pool.req_to_token[
+                    req_pool_indices_cpu[idx]
+                ][: seq_lens_cpu[idx]]
+                self.token_to_kv_pool.free(token_indices)
+                self.req_to_token_pool.free(int(req_pool_indices_cpu[idx]))
+                del self.tree_cache.entries[req.rid]
+            else:
+                # TODO: apply more fine-grained retraction
+                last_uncached_pos = len(req.prefix_indices)
+                token_indices = self.req_to_token_pool.req_to_token[
+                    req_pool_indices_cpu[idx]
+                ][last_uncached_pos : seq_lens_cpu[idx]]
+                self.token_to_kv_pool.free(token_indices)
+                self.req_to_token_pool.free(int(req_pool_indices_cpu[idx]))
+
+                # release the last node
+                self.tree_cache.dec_lock_ref(req.last_node)
+
+                # NOTE(lsyin): we should use the newly evictable memory instantly.
+                residual_size = (
+                    len(sorted_indices) * global_config.retract_decode_steps
+                    - self.token_to_kv_pool.available_size()
+                )
+                residual_size = max(0, residual_size)
+                self.tree_cache.evict(residual_size, self.token_to_kv_pool.free)
 
             req.prefix_indices = None
             req.last_node = None
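
The retraction path now splits on cache type: the ChunkCache branch frees a request's token slots outright (chunk caches have no eviction), while the radix-cache branch releases the request's lock on its tree node and immediately evicts enough of the tree for the remaining candidates to run global_config.retract_decode_steps more decode steps. A worked example of the residual_size arithmetic; the numbers are illustrative, not sglang defaults:

num_candidates = 8         # len(sorted_indices) in the hunk
retract_decode_steps = 20  # global_config.retract_decode_steps (value assumed)
available = 100            # self.token_to_kv_pool.available_size()

# Reserve enough KV slots for every candidate to decode
# retract_decode_steps further; evict only the shortfall.
residual_size = max(0, num_candidates * retract_decode_steps - available)
assert residual_size == 60  # 8 * 20 - 100 tokens must be evicted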
@@ -556,6 +594,7 @@ class Batch:
             if req_pool_indices_cpu is None:
                 req_pool_indices_cpu = self.req_pool_indices.tolist()
             self.tree_cache.cache_req(
+                rid=req.rid,
                 token_ids=cur_all_ids,
                 last_uncached_pos=len(req.prefix_indices),
                 req_pool_idx=req_pool_indices_cpu[i],
@@ -598,7 +637,7 @@ class Batch:
         self.out_cache_loc = self.token_to_kv_pool.alloc(bs)
 
         if self.out_cache_loc is None:
-
+            logger.error("Decode out of memory. This should never happen.")
             self.tree_cache.pretty_print()
             exit()
 
@@ -762,7 +801,7 @@ class InputMetadata:
     flashinfer_prefill_wrapper_ragged: "BatchPrefillWithRaggedKVCacheWrapper" = None
     flashinfer_prefill_wrapper_paged: "BatchPrefillWithPagedKVCacheWrapper" = None
     flashinfer_decode_wrapper: "BatchDecodeWithPagedKVCacheWrapper" = None
-
+    flashinfer_use_ragged: bool = False
 
     @classmethod
     def create(
@@ -778,10 +817,10 @@ class InputMetadata:
         return_logprob=False,
         skip_flashinfer_init=False,
     ):
-
+        flashinfer_use_ragged = False
        if not skip_flashinfer_init and not model_runner.server_args.disable_flashinfer:
             if forward_mode != ForwardMode.DECODE and int(torch.sum(seq_lens)) > 4096:
-
+                flashinfer_use_ragged = True
             init_flashinfer_args(
                 forward_mode,
                 model_runner,
@@ -789,7 +828,7 @@ class InputMetadata:
                 seq_lens,
                 prefix_lens,
                 model_runner.flashinfer_decode_wrapper,
-
+                flashinfer_use_ragged,
             )
 
         batch_size = len(req_pool_indices)
@@ -844,7 +883,7 @@ class InputMetadata:
             flashinfer_prefill_wrapper_ragged=model_runner.flashinfer_prefill_wrapper_ragged,
             flashinfer_prefill_wrapper_paged=model_runner.flashinfer_prefill_wrapper_paged,
             flashinfer_decode_wrapper=model_runner.flashinfer_decode_wrapper,
-
+            flashinfer_use_ragged=flashinfer_use_ragged,
         )
 
         if model_runner.server_args.disable_flashinfer:
@@ -865,7 +904,7 @@ def init_flashinfer_args(
     seq_lens,
     prefix_lens,
     flashinfer_decode_wrapper,
-
+    flashinfer_use_ragged=False,
 ):
     """Init auxiliary variables for FlashInfer attention backend."""
     num_qo_heads = model_runner.model_config.num_attention_heads // model_runner.tp_size
@@ -874,7 +913,7 @@ def init_flashinfer_args(
     batch_size = len(req_pool_indices)
     total_num_tokens = int(torch.sum(seq_lens))
 
-    if
+    if flashinfer_use_ragged:
         paged_kernel_lens = prefix_lens
     else:
         paged_kernel_lens = seq_lens
@@ -910,7 +949,7 @@ def init_flashinfer_args(
     qo_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda")
     qo_indptr[1:] = torch.cumsum(seq_lens - prefix_lens, dim=0)
 
-    if
+    if flashinfer_use_ragged:
         model_runner.flashinfer_prefill_wrapper_ragged.end_forward()
         model_runner.flashinfer_prefill_wrapper_ragged.begin_forward(
             qo_indptr,
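
The flashinfer_use_ragged flag threads from InputMetadata.create into init_flashinfer_args: for non-decode batches totalling more than 4096 tokens, the ragged (non-paged) prefill wrapper handles the newly extended tokens, and only the cached prefix goes through the paged kernel (paged_kernel_lens = prefix_lens instead of seq_lens). A minimal sketch of the gate, with a stub ForwardMode standing in for the real enum in schedule_batch.py:

import torch
from enum import IntEnum, auto

class ForwardMode(IntEnum):  # stub mirroring schedule_batch.ForwardMode
    PREFILL = auto()
    EXTEND = auto()
    DECODE = auto()

def should_use_ragged(forward_mode: ForwardMode, seq_lens: torch.Tensor) -> bool:
    # Ragged prefill applies only to non-decode batches over 4096 total tokens,
    # matching the hard-coded condition in InputMetadata.create above.
    return forward_mode != ForwardMode.DECODE and int(torch.sum(seq_lens)) > 4096

assert should_use_ragged(ForwardMode.EXTEND, torch.tensor([3000, 2000]))
assert not should_use_ragged(ForwardMode.DECODE, torch.tensor([9000]))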
sglang/srt/managers/tokenizer_manager.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """TokenizerManager is a process that tokenizes the text."""
 
 import asyncio
@@ -6,7 +21,7 @@ import dataclasses
 import logging
 import multiprocessing as mp
 import os
-from typing import Dict, List
+from typing import Dict, List, Tuple
 
 import numpy as np
 import transformers
@@ -69,6 +84,7 @@ class TokenizerManager:
             trust_remote_code=server_args.trust_remote_code,
             model_overide_args=model_overide_args,
         )
+
         if server_args.context_length is not None:
             self.context_len = server_args.context_length
         else:
@@ -137,31 +153,33 @@ class TokenizerManager:
         self, obj, request, index=None, is_cache_for_prefill=False
     ):
         if not is_cache_for_prefill:
-
-
+            not_use_index = not (index is not None)
+            rid = obj.rid if not_use_index else obj.rid[index]
+            input_text = obj.text if not_use_index else obj.text[index]
             input_ids = (
                 self.tokenizer.encode(input_text)
                 if obj.input_ids is None
                 else obj.input_ids
             )
-            if
+            if not not_use_index and obj.input_ids:
                 input_ids = obj.input_ids[index]
 
             self._validate_input_length(input_ids)
+
             sampling_params = self._get_sampling_params(
-                obj.sampling_params if
+                obj.sampling_params if not_use_index else obj.sampling_params[index]
             )
             pixel_values, image_hash, image_size = await self._get_pixel_values(
-                obj.image_data if
+                obj.image_data if not_use_index else obj.image_data[index]
             )
             return_logprob = (
-                obj.return_logprob if
+                obj.return_logprob if not_use_index else obj.return_logprob[index]
             )
             logprob_start_len = (
-                obj.logprob_start_len if
+                obj.logprob_start_len if not_use_index else obj.logprob_start_len[index]
             )
             top_logprobs_num = (
-                obj.top_logprobs_num if
+                obj.top_logprobs_num if not_use_index else obj.top_logprobs_num[index]
             )
         else:
             if isinstance(obj.text, list):
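
The double negation not_use_index = not (index is not None) is simply index is None: each request field is read whole for a single request, or at index for one element of a batch. The repeated conditionals above all reduce to this hypothetical helper:

def pick(value, index):
    # index is None for a single request; otherwise select one batch element.
    return value if index is None else value[index]

# e.g. sampling_params = pick(obj.sampling_params, index)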
@@ -209,7 +227,7 @@ class TokenizerManager:
 
     async def _handle_batch_request(self, obj: GenerateReqInput, request):
         batch_size = obj.batch_size
-        parallel_sample_num = obj.
+        parallel_sample_num = obj.parallel_sample_num
 
         if parallel_sample_num != 1:
             # Send prefill requests to cache the common input
@@ -226,7 +244,6 @@ class TokenizerManager:
                 obj.input_ids = input_id_result
             elif input_id_result is not None:
                 obj.input_ids = input_id_result[0]
-
         # First send out all requests
         for i in range(batch_size):
             for j in range(parallel_sample_num):
@@ -234,7 +251,7 @@ class TokenizerManager:
                     continue
                 index = i * parallel_sample_num + j
                 if parallel_sample_num != 1:
-                    # Here when using parallel sampling we
+                    # Here when using parallel sampling we should consider prefill stage so the index is : j + i * (parallel_sample_num-1) + batch_size - 1
                     index += batch_size - 1 - i
                 rid = obj.rid[index]
                 if parallel_sample_num == 1:
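
The index adjustment accounts for the shared prefill requests that occupy the leading slots of the flattened rid list when parallel sampling is on. The in-place form (index = i * parallel_sample_num + j, then index += batch_size - 1 - i) agrees with the formula spelled out in the comment, which a quick check confirms:

batch_size, n = 2, 3  # illustrative sizes

for i in range(batch_size):
    for j in range(n):
        index = i * n + j + (batch_size - 1 - i)
        # Identical to the comment's formula: j + i * (n - 1) + batch_size - 1
        assert index == j + i * (n - 1) + batch_size - 1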
@@ -469,7 +486,9 @@ class TokenizerManager:
         )
         return ret
 
-    def detokenize_logprob_tokens(
+    def detokenize_logprob_tokens(
+        self, token_logprobs: List[Tuple[float, int]], decode_to_text: bool
+    ):
         if not decode_to_text:
             return [(logprob, token_id, None) for logprob, token_id in token_logprobs]
 
@@ -481,9 +500,13 @@ class TokenizerManager:
         ]
 
     def detokenize_top_logprobs_tokens(self, top_logprobs, decode_to_text: bool):
-        for
-
-
+        # TODO: The current implementation only batches the detokenization for top-k tokens per single position.
+        # We should batch all top-k tokens in all positions.
+        for i, token_top_logprobs in enumerate(top_logprobs):
+            if token_top_logprobs:
+                top_logprobs[i] = self.detokenize_logprob_tokens(
+                    token_top_logprobs, decode_to_text
+                )
         return top_logprobs
 
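
As the TODO notes, detokenization is batched only across the top-k tokens at a single position, not across positions. A sketch of what the per-position call amounts to, assuming a Hugging Face-style tokenizer with batch_decode; the decode branch itself is outside the hunk, so its body here is an assumption:

from typing import List, Optional, Tuple

def detokenize_logprob_tokens_sketch(
    tokenizer, token_logprobs: List[Tuple[float, int]], decode_to_text: bool
) -> List[Tuple[float, int, Optional[str]]]:
    if not decode_to_text:
        return [(logprob, token_id, None) for logprob, token_id in token_logprobs]
    # One batch_decode call covers all top-k token ids at this position.
    token_texts = tokenizer.batch_decode([tid for _, tid in token_logprobs])
    return [
        (logprob, token_id, text)
        for (logprob, token_id), text in zip(token_logprobs, token_texts)
    ]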