sglang 0.2.7__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/interpreter.py +2 -1
- sglang/lang/ir.py +0 -1
- sglang/srt/constrained/{base_cache.py → base_tool_cache.py} +2 -2
- sglang/srt/constrained/fsm_cache.py +2 -2
- sglang/srt/constrained/jump_forward.py +2 -2
- sglang/srt/managers/schedule_batch.py +29 -9
- sglang/srt/managers/tp_worker.py +29 -6
- sglang/srt/mem_cache/base_cache.py +43 -0
- sglang/srt/mem_cache/chunk_cache.py +60 -0
- sglang/srt/mem_cache/radix_cache.py +5 -2
- sglang/srt/model_executor/model_runner.py +17 -2
- sglang/srt/models/llama2.py +5 -21
- sglang/srt/openai_api/adapter.py +69 -16
- sglang/srt/openai_api/protocol.py +20 -2
- sglang/srt/server.py +4 -1
- sglang/srt/server_args.py +7 -4
- sglang/test/test_programs.py +9 -6
- sglang/version.py +1 -1
- {sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/METADATA +4 -4
- {sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/RECORD +23 -21
- {sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/LICENSE +0 -0
- {sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/WHEEL +0 -0
- {sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/top_level.txt +0 -0
sglang/lang/interpreter.py
CHANGED
@@ -553,7 +553,8 @@ class StreamExecutor:
                     "output_token_logprobs": output_token_logprobs,
                 }
                 self.variable_event[name].set()
-                self.stream_var_event[name].set()
+                if self.stream_var_event:
+                    self.stream_var_event[name].set()
         self.text_ += decision

     def _execute_variable(self, expr: SglVariable):
sglang/lang/ir.py
CHANGED
sglang/srt/constrained/{base_cache.py → base_tool_cache.py}
RENAMED
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

-"""Base cache…
+"""Base tool cache for constrained decoding tools."""

 import time


-class BaseCache:
+class BaseToolCache:
     def __init__(self, enable=True):
         self.enable = enable
         self.reset()
sglang/srt/constrained/fsm_cache.py
CHANGED
@@ -16,10 +16,10 @@ limitations under the License.
 """Cache for the compressed finite state machine."""

 from sglang.srt.constrained import RegexGuide, TransformerTokenizer
-from sglang.srt.constrained.base_cache import BaseCache
+from sglang.srt.constrained.base_tool_cache import BaseToolCache


-class FSMCache(BaseCache):
+class FSMCache(BaseToolCache):
     def __init__(self, tokenizer_path, tokenizer_args_dict, enable=True):
         super().__init__(enable=enable)

sglang/srt/constrained/jump_forward.py
CHANGED
@@ -30,7 +30,7 @@ from sglang.srt.constrained import (
     make_byte_level_fsm,
     make_deterministic_fsm,
 )
-from sglang.srt.constrained.base_cache import BaseCache
+from sglang.srt.constrained.base_tool_cache import BaseToolCache

 IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"

@@ -151,7 +151,7 @@ class JumpForwardMap:
     )


-class JumpForwardCache(BaseCache):
+class JumpForwardCache(BaseToolCache):
     def __init__(self):
         super().__init__()

sglang/srt/managers/schedule_batch.py
CHANGED
@@ -28,6 +28,7 @@ from flashinfer.sampling import top_k_top_p_sampling_from_probs
 from sglang.global_config import global_config
 from sglang.srt.constrained import RegexGuide
 from sglang.srt.constrained.jump_forward import JumpForwardMap
+from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPool
 from sglang.srt.mem_cache.radix_cache import RadixCache

@@ -486,15 +487,33 @@ class Batch:
             req = self.reqs[idx]
             retracted_reqs.append(req)

-            … (9 removed lines are not rendered in the source diff view)
+            if isinstance(self.tree_cache, ChunkCache):
+                # ChunkCache does not have eviction
+                token_indices = self.req_to_token_pool.req_to_token[
+                    req_pool_indices_cpu[idx]
+                ][: seq_lens_cpu[idx]]
+                self.token_to_kv_pool.free(token_indices)
+                self.req_to_token_pool.free(int(req_pool_indices_cpu[idx]))
+                del self.tree_cache.entries[req.rid]
+            else:
+                # TODO: apply more fine-grained retraction
+                last_uncached_pos = len(req.prefix_indices)
+                token_indices = self.req_to_token_pool.req_to_token[
+                    req_pool_indices_cpu[idx]
+                ][last_uncached_pos : seq_lens_cpu[idx]]
+                self.token_to_kv_pool.free(token_indices)
+                self.req_to_token_pool.free(int(req_pool_indices_cpu[idx]))
+
+                # release the last node
+                self.tree_cache.dec_lock_ref(req.last_node)
+
+                # NOTE(lsyin): we should use the newly evictable memory instantly.
+                residual_size = (
+                    len(sorted_indices) * global_config.retract_decode_steps
+                    - self.token_to_kv_pool.available_size()
+                )
+                residual_size = max(0, residual_size)
+                self.tree_cache.evict(residual_size, self.token_to_kv_pool.free)

             req.prefix_indices = None
             req.last_node = None
@@ -575,6 +594,7 @@ class Batch:
             if req_pool_indices_cpu is None:
                 req_pool_indices_cpu = self.req_pool_indices.tolist()
             self.tree_cache.cache_req(
+                rid=req.rid,
                 token_ids=cur_all_ids,
                 last_uncached_pos=len(req.prefix_indices),
                 req_pool_idx=req_pool_indices_cpu[i],
sglang/srt/managers/tp_worker.py
CHANGED
@@ -43,6 +43,7 @@ from sglang.srt.managers.schedule_batch import (
     ForwardMode,
     Req,
 )
+from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.radix_cache import RadixCache
 from sglang.srt.model_config import ModelConfig
 from sglang.srt.model_executor.model_runner import ModelRunner
@@ -144,11 +145,20 @@ class ModelTpServer:
         )

         # Init cache
-        self.tree_cache = RadixCache(
-            req_to_token_pool=self.model_runner.req_to_token_pool,
-            token_to_kv_pool=self.model_runner.token_to_kv_pool,
-            disable=server_args.disable_radix_cache,
-        )
+        if (
+            server_args.chunked_prefill_size is not None
+            and server_args.disable_radix_cache
+        ):
+            self.tree_cache = ChunkCache(
+                req_to_token_pool=self.model_runner.req_to_token_pool,
+                token_to_kv_pool=self.model_runner.token_to_kv_pool,
+            )
+        else:
+            self.tree_cache = RadixCache(
+                req_to_token_pool=self.model_runner.req_to_token_pool,
+                token_to_kv_pool=self.model_runner.token_to_kv_pool,
+                disable=server_args.disable_radix_cache,
+            )
         self.tree_cache_metrics = {"total": 0, "hit": 0}
         self.scheduler = PolicyScheduler(
             self.schedule_policy,
@@ -280,6 +290,14 @@ class ModelTpServer:
                 "KV cache pool leak detected!"
             )

+        if self.req_to_token_pool.can_use_mem_size != self.req_to_token_pool.size:
+            warnings.warn(
+                "Warning: "
+                f"available req slots={self.req_to_token_pool.can_use_mem_size}, "
+                f"total slots={self.req_to_token_pool.size}\n"
+                "Memory pool leak detected!"
+            )
+
     def handle_generate_request(
         self,
         recv_req: TokenizedGenerateReqInput,
@@ -346,7 +364,10 @@
         # Compute matched prefix length
         for req in self.waiting_queue:
             req.input_ids = req.origin_input_ids + req.output_ids
-            prefix_indices, last_node = self.tree_cache.match_prefix(req.input_ids)
+            prefix_indices, last_node = self.tree_cache.match_prefix(
+                rid=req.rid,
+                key=req.input_ids,
+            )
             if req.return_logprob:
                 prefix_indices = prefix_indices[: req.logprob_start_len]
             req.extend_input_len = len(req.input_ids) - len(prefix_indices)
@@ -606,6 +627,7 @@
         req_pool_indices_cpu = batch.req_pool_indices.cpu().numpy()
         for i, req in enumerate(batch.reqs):
             new_prefix_indices, new_last_node = self.tree_cache.cache_req(
+                rid=req.rid,
                 token_ids=tuple(req.input_ids),
                 last_uncached_pos=len(req.prefix_indices),
                 req_pool_idx=req_pool_indices_cpu[i],
@@ -763,6 +785,7 @@
         for i in finished_indices:
             req = batch.reqs[i]
             self.tree_cache.cache_req(
+                rid=req.rid,
                 token_ids=tuple(req.origin_input_ids + req.output_ids)[:-1],
                 last_uncached_pos=len(req.prefix_indices),
                 req_pool_idx=req_pool_indices_cpu[i],
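
For context, the cache selection above only builds a ChunkCache when chunked prefill is enabled and the radix cache is disabled; otherwise the RadixCache path is unchanged. A minimal launch sketch, assuming `Runtime` forwards these keyword arguments to `ServerArgs` (the model path is a placeholder):

```python
import sglang as sgl

# Hypothetical launch: chunked prefill together with a disabled radix cache is
# the combination that makes ModelTpServer build a ChunkCache instead of a RadixCache.
runtime = sgl.Runtime(
    model_path="meta-llama/Llama-2-7b-chat-hf",  # placeholder model
    chunked_prefill_size=4096,   # enable chunked prefill
    disable_radix_cache=True,    # disable the radix cache
)
runtime.shutdown()
```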
sglang/srt/mem_cache/base_cache.py
ADDED
@@ -0,0 +1,43 @@
+from abc import ABC, abstractmethod
+
+
+class BasePrefixCache(ABC):
+    """Cache can be indexed by either rid or key."""
+
+    @abstractmethod
+    def reset(self):
+        pass
+
+    @abstractmethod
+    def match_prefix(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def insert(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def cache_req(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def evict(self, num_tokens, evict_callback):
+        pass
+
+    @abstractmethod
+    def inc_lock_ref(self, node):
+        pass
+
+    @abstractmethod
+    def dec_lock_ref(self, node):
+        pass
+
+    @abstractmethod
+    def evictable_size(self):
+        pass
+
+    def total_size(self):
+        raise NotImplementedError
+
+    def pretty_print(self):
+        raise NotImplementedError
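
`BasePrefixCache` is the new abstract interface shared by `RadixCache` and `ChunkCache`. As a rough, illustrative sketch of the contract (not part of the package), a do-nothing subclass only needs the abstract methods:

```python
from sglang.srt.mem_cache.base_cache import BasePrefixCache


class NoOpPrefixCache(BasePrefixCache):
    """Illustrative stub: caches nothing, so no prefix is ever reused."""

    def reset(self):
        pass

    def match_prefix(self, **kwargs):
        # No stored prefixes, so nothing ever matches.
        return [], None

    def insert(self, **kwargs):
        return 0

    def cache_req(self, **kwargs):
        return None

    def evict(self, num_tokens, evict_callback):
        pass

    def inc_lock_ref(self, node):
        return 0

    def dec_lock_ref(self, node):
        return 0

    def evictable_size(self):
        return 0
```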
sglang/srt/mem_cache/chunk_cache.py
ADDED
@@ -0,0 +1,60 @@
+"""Cache for chunked prefill, used when RadixCache is disabled."""
+
+from sglang.srt.mem_cache.base_cache import BasePrefixCache
+
+
+class ChunkCacheEntry:
+    def __init__(self, rid, value):
+        self.rid = rid
+        self.value = value
+
+
+class ChunkCache(BasePrefixCache):
+    def __init__(self, req_to_token_pool, token_to_kv_pool):
+        self.disable = True
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool = token_to_kv_pool
+
+        self.reset()
+
+    def reset(self):
+        self.entries = {}
+
+    def match_prefix(self, rid, **kwargs):
+        if rid not in self.entries:
+            return [], None
+
+        entry = self.entries[rid]
+        return entry.value, entry
+
+    def cache_req(
+        self, rid, token_ids, req_pool_idx, del_in_memory_pool=True, **kwargs
+    ):
+        indices = self.req_to_token_pool.req_to_token[req_pool_idx, : len(token_ids)]
+        if del_in_memory_pool:
+            assert rid in self.entries
+            self.req_to_token_pool.free(req_pool_idx)
+            self.token_to_kv_pool.free(indices)
+            return
+
+        if rid not in self.entries:
+            self.entries[rid] = ChunkCacheEntry(rid, indices)
+
+        entry = self.entries[rid]
+        entry.value = indices
+        return indices, entry
+
+    def insert(self):
+        raise NotImplementedError
+
+    def evict(self, num_tokens, evict_callback):
+        pass
+
+    def inc_lock_ref(self, node):
+        return 0
+
+    def dec_lock_ref(self, node):
+        return 0
+
+    def evictable_size(self):
+        return 0
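
A rough sketch of the rid-keyed flow `ChunkCache` implements; the two tiny pools below are hypothetical stand-ins for `ReqToTokenPool` and `TokenToKVPool`, kept only to make the example self-contained:

```python
import numpy as np

from sglang.srt.mem_cache.chunk_cache import ChunkCache


class _FakeReqToTokenPool:
    """Stand-in pool: one row of token indices per request slot."""

    def __init__(self):
        self.req_to_token = np.arange(64).reshape(4, 16)

    def free(self, req_pool_idx):
        pass  # illustrative no-op


class _FakeTokenToKVPool:
    def free(self, indices):
        pass  # illustrative no-op


cache = ChunkCache(_FakeReqToTokenPool(), _FakeTokenToKVPool())

# While request "abc" is being chunk-prefilled, remember its token indices
# without freeing anything (del_in_memory_pool=False).
indices, entry = cache.cache_req(
    rid="abc", token_ids=list(range(8)), req_pool_idx=0, del_in_memory_pool=False
)

# Later scheduling rounds look the chunk up by rid rather than by token key.
prefix, entry = cache.match_prefix(rid="abc")

# When the request finishes, the default del_in_memory_pool=True path frees
# the request slot and its KV indices.
cache.cache_req(rid="abc", token_ids=list(range(8)), req_pool_idx=0)
```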
sglang/srt/mem_cache/radix_cache.py
CHANGED
@@ -23,6 +23,8 @@ from collections import defaultdict

 import torch

+from sglang.srt.mem_cache.base_cache import BasePrefixCache
+

 class TreeNode:
     def __init__(self):
@@ -46,7 +48,7 @@ def _key_match(key0, key1):
     return i


-class RadixCache:
+class RadixCache(BasePrefixCache):
     def __init__(self, req_to_token_pool, token_to_kv_pool, disable: bool = False):
         self.req_to_token_pool = req_to_token_pool
         self.token_to_kv_pool = token_to_kv_pool
@@ -62,7 +64,7 @@ class RadixCache:
         self.root_node.lock_ref = 1
         self.evictable_size_ = 0

-    def match_prefix(self, key):
+    def match_prefix(self, key, **kwargs):
         if self.disable:
             return [], self.root_node

@@ -90,6 +92,7 @@ class RadixCache:
         req_pool_idx,
         del_in_memory_pool=True,
         old_last_node=None,
+        **kwargs,
     ):
         # Insert the request into radix cache
         indices = self.req_to_token_pool.req_to_token[req_pool_idx, : len(token_ids)]
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -19,6 +19,7 @@ import importlib
 import importlib.resources
 import logging
 import pkgutil
+import warnings
 from functools import lru_cache
 from typing import Optional, Type

@@ -121,7 +122,11 @@ class ModelRunner:

         # Load the model and create memory pool
         self.load_model()
-        self.init_memory_pool(total_gpu_memory, server_args.max_num_reqs)
+        self.init_memory_pool(
+            total_gpu_memory,
+            server_args.max_num_reqs,
+            server_args.max_total_tokens,
+        )
         self.init_cublas()
         self.init_flash_infer()

@@ -203,8 +208,18 @@ class ModelRunner:
         max_num_token = int(rest_memory * (1 << 30) // cell_size)
         return max_num_token

-    def init_memory_pool(self, total_gpu_memory, max_num_reqs=None):
+    def init_memory_pool(
+        self, total_gpu_memory, max_num_reqs=None, max_total_tokens=None
+    ):
         self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory)
+        if max_total_tokens is not None:
+            if max_total_tokens > self.max_total_num_tokens:
+                warnings.warn(
+                    f"max_total_tokens={max_total_tokens} is larger than the profiled value "
+                    f"{self.max_total_num_tokens}. "
+                    f"Use the profiled value instead."
+                )
+            self.max_total_num_tokens = min(self.max_total_num_tokens, max_total_tokens)

         if self.max_total_num_tokens <= 0:
             raise RuntimeError(
sglang/srt/models/llama2.py
CHANGED
@@ -26,6 +26,11 @@ from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -38,10 +43,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.model_runner import InputMetadata

-MergedColumnParallelLinear = None
-QKVParallelLinear = None
-RowParallelLinear = None
-

 class LlamaMLP(nn.Module):
     def __init__(
@@ -295,23 +296,6 @@ class LlamaForCausalLM(nn.Module):
         cache_config: Optional[CacheConfig] = None,
         efficient_weight_load=False,
     ) -> None:
-        global MergedColumnParallelLinear
-        global QKVParallelLinear
-        global RowParallelLinear
-
-        if efficient_weight_load:
-            from sglang.srt.layers.linear import (
-                MergedColumnParallelLinear,
-                QKVParallelLinear,
-                RowParallelLinear,
-            )
-        else:
-            from vllm.model_executor.layers.linear import (
-                MergedColumnParallelLinear,
-                QKVParallelLinear,
-                RowParallelLinear,
-            )
-
         super().__init__()
         self.config = config
         self.quant_config = quant_config
sglang/srt/openai_api/adapter.py
CHANGED
@@ -43,7 +43,9 @@ from sglang.srt.openai_api.protocol import (
     ChatCompletionResponseChoice,
     ChatCompletionResponseStreamChoice,
     ChatCompletionStreamResponse,
+    ChatCompletionTokenLogprob,
     ChatMessage,
+    ChoiceLogprobs,
     CompletionRequest,
     CompletionResponse,
     CompletionResponseChoice,
@@ -54,6 +56,7 @@ from sglang.srt.openai_api.protocol import (
     FileRequest,
     FileResponse,
     LogProbs,
+    TopLogprob,
     UsageInfo,
 )

@@ -70,7 +73,7 @@ class FileMetadata:
 batch_storage: Dict[str, BatchResponse] = {}
 file_id_request: Dict[str, FileMetadata] = {}
 file_id_response: Dict[str, FileResponse] = {}
-
+# map file id to file path in SGlang backend
 file_id_storage: Dict[str, str] = {}


@@ -261,7 +264,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRequest):
         failed_requests += len(file_request_list)

     for idx, response in enumerate(responses):
-
+        # the batch_req here can be changed to be named within a batch granularity
         response_json = {
             "id": f"batch_req_{uuid.uuid4()}",
             "custom_id": file_request_list[idx].get("custom_id"),
@@ -333,6 +336,8 @@ def v1_generate_request(all_requests):

     prompts = []
     sampling_params_list = []
+    return_logprobs = []
+    top_logprobs_nums = []
     first_prompt_type = type(all_requests[0].prompt)
     for request in all_requests:
         prompt = request.prompt
@@ -340,6 +345,10 @@ def v1_generate_request(all_requests):
             type(prompt) == first_prompt_type
         ), "All prompts must be of the same type in file input settings"
         prompts.append(prompt)
+        return_logprobs.append(request.logprobs is not None and request.logprobs > 0)
+        top_logprobs_nums.append(
+            request.logprobs if request.logprobs is not None else 0
+        )
         sampling_params_list.append(
             {
                 "temperature": request.temperature,
@@ -361,7 +370,9 @@ def v1_generate_request(all_requests):
     if len(all_requests) == 1:
         prompt = prompts[0]
         sampling_params_list = sampling_params_list[0]
-
+        return_logprobs = return_logprobs[0]
+        top_logprobs_nums = top_logprobs_nums[0]
+        if isinstance(prompt, str) or isinstance(prompt[0], str):
             prompt_kwargs = {"text": prompt}
         else:
             prompt_kwargs = {"input_ids": prompt}
@@ -370,15 +381,11 @@ def v1_generate_request(all_requests):
             prompt_kwargs = {"text": prompts}
         else:
             prompt_kwargs = {"input_ids": prompts}
-
     adapted_request = GenerateReqInput(
         **prompt_kwargs,
         sampling_params=sampling_params_list,
-        return_logprob=…
-
-        top_logprobs_num=(
-            all_requests[0].logprobs if all_requests[0].logprobs is not None else 0
-        ),
+        return_logprob=return_logprobs,
+        top_logprobs_num=top_logprobs_nums,
         return_text_in_logprobs=True,
         stream=all_requests[0].stream,
     )
@@ -430,7 +437,7 @@ def v1_generate_response(request, ret, to_file=False):
         logprobs = None

     if to_file:
-
+        # to make the choise data json serializable
         choice_data = {
             "index": 0,
             "text": text,
@@ -454,7 +461,7 @@ def v1_generate_response(request, ret, to_file=False):
             "status_code": 200,
             "request_id": ret[i]["meta_info"]["id"],
             "body": {
-
+                # remain the same but if needed we can change that
                 "id": ret[i]["meta_info"]["id"],
                 "object": "text_completion",
                 "created": int(time.time()),
@@ -590,6 +597,8 @@ def v1_chat_generate_request(all_requests, tokenizer_manager):
     texts = []
     sampling_params_list = []
     image_data_list = []
+    return_logprobs = []
+    top_logprobs_nums = []
     for request in all_requests:
         # Prep the data needed for the underlying GenerateReqInput:
         #  - prompt: The full prompt string.
@@ -620,6 +629,8 @@ def v1_chat_generate_request(all_requests, tokenizer_manager):
             stop = request.stop
             image_data = None
         texts.append(prompt)
+        return_logprobs.append(request.logprobs)
+        top_logprobs_nums.append(request.top_logprobs)
         sampling_params_list.append(
             {
                 "temperature": request.temperature,
@@ -637,11 +648,16 @@
         texts = texts[0]
         sampling_params_list = sampling_params_list[0]
         image_data = image_data_list[0]
+        return_logprobs = return_logprobs[0]
+        top_logprobs_nums = top_logprobs_nums[0]
     adapted_request = GenerateReqInput(
         text=texts,
         image_data=image_data,
         sampling_params=sampling_params_list,
-
+        return_logprob=return_logprobs,
+        top_logprobs_num=top_logprobs_nums,
+        stream=all_requests[0].stream,
+        return_text_in_logprobs=True,
     )
     if len(all_requests) == 1:
         return adapted_request, all_requests[0]
@@ -654,26 +670,63 @@ def v1_chat_generate_response(request, ret, to_file=False):
     total_completion_tokens = 0

     for idx, ret_item in enumerate(ret):
+        logprobs = False
+        if isinstance(request, List) and request[idx].logprobs:
+            logprobs = True
+        elif (not isinstance(request, List)) and request.logprobs:
+            logprobs = True
+        if logprobs:
+            logprobs = to_openai_style_logprobs(
+                output_token_logprobs=ret_item["meta_info"]["output_token_logprobs"],
+                output_top_logprobs=ret_item["meta_info"]["output_top_logprobs"],
+            )
+            token_logprobs = []
+            for token, logprob in zip(logprobs.tokens, logprobs.token_logprobs):
+                token_bytes = list(token.encode("utf-8"))
+                top_logprobs = []
+                if logprobs.top_logprobs:
+                    for top_token, top_logprob in logprobs.top_logprobs[0].items():
+                        top_token_bytes = list(top_token.encode("utf-8"))
+                        top_logprobs.append(
+                            TopLogprob(
+                                token=top_token,
+                                bytes=top_token_bytes,
+                                logprob=top_logprob,
+                            )
+                        )
+                token_logprobs.append(
+                    ChatCompletionTokenLogprob(
+                        token=token,
+                        bytes=token_bytes,
+                        logprob=logprob,
+                        top_logprobs=top_logprobs,
+                    )
+                )
+
+            choice_logprobs = ChoiceLogprobs(content=token_logprobs)
+        else:
+            choice_logprobs = None
         prompt_tokens = ret_item["meta_info"]["prompt_tokens"]
         completion_tokens = ret_item["meta_info"]["completion_tokens"]

         if to_file:
-
+            # to make the choice data json serializable
             choice_data = {
                 "index": 0,
                 "message": {"role": "assistant", "content": ret_item["text"]},
-                "logprobs": …
+                "logprobs": choice_logprobs,
                 "finish_reason": ret_item["meta_info"]["finish_reason"],
             }
         else:
             choice_data = ChatCompletionResponseChoice(
                 index=idx,
                 message=ChatMessage(role="assistant", content=ret_item["text"]),
+                logprobs=choice_logprobs,
                 finish_reason=ret_item["meta_info"]["finish_reason"],
             )

         choices.append(choice_data)
-        total_prompt_tokens = prompt_tokens
+        total_prompt_tokens += prompt_tokens
         total_completion_tokens += completion_tokens
     if to_file:
         responses = []
@@ -683,7 +736,7 @@ def v1_chat_generate_response(request, ret, to_file=False):
             "status_code": 200,
             "request_id": ret[i]["meta_info"]["id"],
             "body": {
-
+                # remain the same but if needed we can change that
                 "id": ret[i]["meta_info"]["id"],
                 "object": "chat.completion",
                 "created": int(time.time()),
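
With the adapter changes above, `/v1/chat/completions` can return OpenAI-style per-token logprobs. A hedged usage sketch against a locally launched server (port, model name, and API key are placeholders):

```python
import openai

# Assumes an sglang server is already running locally, e.g. on port 30000.
client = openai.OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="default",  # placeholder model name
    messages=[{"role": "user", "content": "Name one prime number."}],
    logprobs=True,
    top_logprobs=2,
    max_tokens=8,
)

choice = resp.choices[0]
print(choice.message.content)
# Each entry carries token, bytes, logprob, and its top_logprobs alternatives.
for tok in choice.logprobs.content:
    print(tok.token, tok.logprob, [t.token for t in tok.top_logprobs])
```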
sglang/srt/openai_api/protocol.py
CHANGED
@@ -54,6 +54,24 @@ class LogProbs(BaseModel):
     top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list)


+class TopLogprob(BaseModel):
+    token: str
+    bytes: List[int]
+    logprob: float
+
+
+class ChatCompletionTokenLogprob(BaseModel):
+    token: str
+    bytes: List[int]
+    logprob: float
+    top_logprobs: List[TopLogprob]
+
+
+class ChoiceLogprobs(BaseModel):
+    # build for v1/chat/completions response
+    content: List[ChatCompletionTokenLogprob]
+
+
 class UsageInfo(BaseModel):
     prompt_tokens: int = 0
     total_tokens: int = 0
@@ -239,8 +257,8 @@ class ChatMessage(BaseModel):
 class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
-    logprobs: Optional[LogProbs] = None
-    finish_reason: …
+    logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
+    finish_reason: str


 class ChatCompletionResponse(BaseModel):
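
The new response-side models nest as `ChoiceLogprobs -> ChatCompletionTokenLogprob -> TopLogprob`. A small standalone sketch of one token entry (the serialization call assumes pydantic v2; use `.dict()` on v1):

```python
from sglang.srt.openai_api.protocol import (
    ChatCompletionTokenLogprob,
    ChoiceLogprobs,
    TopLogprob,
)

entry = ChatCompletionTokenLogprob(
    token="Hello",
    bytes=list("Hello".encode("utf-8")),
    logprob=-0.12,
    top_logprobs=[
        TopLogprob(token="Hello", bytes=list("Hello".encode("utf-8")), logprob=-0.12),
        TopLogprob(token="Hi", bytes=list("Hi".encode("utf-8")), logprob=-2.50),
    ],
)
choice_logprobs = ChoiceLogprobs(content=[entry])
print(choice_logprobs.model_dump())  # assumes pydantic v2
```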
sglang/srt/server.py
CHANGED
@@ -260,7 +260,7 @@ def launch_server(
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.…",
+            "0.1.3",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -479,6 +479,9 @@ class Runtime:
         parent.wait(timeout=5)
         self.pid = None

+    def cache_prefix(self, prefix: str):
+        self.endpoint.cache_prefix(prefix)
+
     def get_tokenizer(self):
         return get_tokenizer(
             self.server_args.tokenizer_path,
sglang/srt/server_args.py
CHANGED
@@ -44,6 +44,7 @@ class ServerArgs:
     max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
     max_num_reqs: Optional[int] = None
+    max_total_tokens: Optional[int] = None
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0

@@ -231,6 +232,12 @@
         default=ServerArgs.max_num_reqs,
         help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
     )
+    parser.add_argument(
+        "--max-total-tokens",
+        type=int,
+        default=ServerArgs.max_total_tokens,
+        help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.",
+    )
     parser.add_argument(
         "--schedule-policy",
         type=str,
@@ -412,10 +419,6 @@
             self.dp_size > 1 and self.node_rank is not None
         ), "multi-node data parallel is not supported"

-        assert not (
-            self.chunked_prefill_size is not None and self.disable_radix_cache
-        ), "chunked prefill is not supported with radix cache disabled currently"
-

 @dataclasses.dataclass
 class PortArgs:
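
The new `--max-total-tokens` flag caps the KV token pool; `init_memory_pool` clamps it to the profiled maximum, so an oversized value only triggers a warning. A hedged sketch (the CLI form is shown as a comment; constructing `ServerArgs` directly like this is an assumption for illustration):

```python
# CLI form (model path is a placeholder):
#   python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf \
#       --max-total-tokens 200000

from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="meta-llama/Llama-2-7b-chat-hf",  # placeholder
    max_total_tokens=200_000,
)
# ModelRunner.init_memory_pool() takes min(profiled_max, max_total_tokens),
# so a value larger than what fits in GPU memory falls back to the profiled size.
print(args.max_total_tokens)
```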
sglang/test/test_programs.py
CHANGED
@@ -113,15 +113,14 @@ def test_decode_json_regex():
         s += ' "population": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
         s += ' "area": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
         s += ' "latitude": ' + sgl.gen(regex=REGEX_FLOAT + ",") + "\n"
-        s += ' "country": ' + sgl.gen(regex=REGEX_STRING + ",") + "\n"
-        s += ' "timezone": ' + sgl.gen(regex=REGEX_STRING) + "\n"
+        s += ' "country": ' + sgl.gen(regex=REGEX_STRING) + "\n"
         s += "}"

-    ret = decode_json.run()
+    ret = decode_json.run(temperature=0.0)
     try:
         js_obj = json.loads(ret["json_output"])
     except json.decoder.JSONDecodeError:
-        print(ret["json_output"])
+        print("JSONDecodeError", ret["json_output"])
         raise
     assert isinstance(js_obj["name"], str)
     assert isinstance(js_obj["population"], int)
@@ -141,8 +140,12 @@ def test_decode_json():
         s += ' "timezone": ' + sgl.gen(dtype=str) + "\n"
         s += "}"

-    ret = decode_json.run()
-    js_obj = json.loads(ret["json_output"])
+    ret = decode_json.run(max_new_tokens=64)
+    try:
+        js_obj = json.loads(ret["json_output"])
+    except json.decoder.JSONDecodeError:
+        print("JSONDecodeError", ret["json_output"])
+        raise
     assert isinstance(js_obj["name"], str)
     assert isinstance(js_obj["population"], int)

sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.7"
+__version__ = "0.2.8"
{sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.7
+Version: 0.2.8
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -299,8 +299,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/

 ### Method 2: From source
 ```
-# Use the stable v0.2.7 branch
-git clone -b v0.2.7 https://github.com/sgl-project/sglang.git
+# Use the stable v0.2.8 branch
+git clone -b v0.2.8 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/

 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](docker).
-
+Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).

 ```bash
 docker run --gpus all \
{sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/RECORD
CHANGED
@@ -7,12 +7,12 @@ sglang/global_config.py,sha256=CyhGL7PE-KlMcg7IHWykzImU1y4NQlpeIlh9lHA77uo,1749
 sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
 sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
 sglang/utils.py,sha256=r0Z7hY_bFFk-b6WeQJir9br-hCW2-p7n5E7Et2WziaQ,8776
-sglang/version.py,sha256=…
+sglang/version.py,sha256=G6Dbxq2ws-1ZAXwDD8q0KWueYtso_Y6Uyvtj8sRWsPI,22
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
 sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
-sglang/lang/interpreter.py,sha256=…
-sglang/lang/ir.py,sha256=…
+sglang/lang/interpreter.py,sha256=_MbvYB0vweCgALklpM2DlofiCXuITCmX_fl8rPPcp5U,30340
+sglang/lang/ir.py,sha256=0r-mhA4aO-uuS97Dvkw99ERTcJXfzuV6jJQMmuCwHEg,16615
 sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
 sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
@@ -26,13 +26,13 @@ sglang/srt/hf_transformers_utils.py,sha256=Fg-3panb6lsqOhHmAYA0ivkXyBjdnvY5mqvil
 sglang/srt/mm_utils.py,sha256=n7_GmbOM_0IWVXovpM34rKIBw0Py9yb_NXSQw27u4OA,9454
 sglang/srt/model_config.py,sha256=DO7m84WiT3dzPWmyKz_UXDAHEdqEjq8Lq5wCjzjYMME,6023
 sglang/srt/sampling_params.py,sha256=uZFDlTUPnNR5_3IDH-INDeN-tm6LlRkC2KT-B3njxJs,3687
-sglang/srt/server.py,sha256=…
-sglang/srt/server_args.py,sha256=…
+sglang/srt/server.py,sha256=8uDMWGAp2EZ8bywQumEa6T2G2k78-oYXgLfk6qBkv8o,16107
+sglang/srt/server_args.py,sha256=zGAbZqKKN4dkn5BDcZdjxLM-jIFsHX2ThAEfvPKUm6c,15645
 sglang/srt/utils.py,sha256=uIatocIFzqi6fWSscz2MjF3jUcIRBJlqLgYeicM_W9s,22950
 sglang/srt/constrained/__init__.py,sha256=NLpZGj9RIx83ejDrM_pfaRtqGgaPq_ggJszPQENUJ2E,2037
-sglang/srt/constrained/base_cache.py,sha256=…
-sglang/srt/constrained/fsm_cache.py,sha256=…
-sglang/srt/constrained/jump_forward.py,sha256=…
+sglang/srt/constrained/base_tool_cache.py,sha256=1_m-AivPtWRwUgGiEZBafCrSFUGahK4UM4vgAd8TkMg,2004
+sglang/srt/constrained/fsm_cache.py,sha256=GoPBr_9ZdJizF2PKbYoQw2I4ckfrUYwCeMZxB9sY3TM,2639
+sglang/srt/constrained/jump_forward.py,sha256=IgZ8D0woy5FLIQvXkE8wZRYejDsfVkjU0sqUlkiv_f4,6193
 sglang/srt/layers/context_flashattention_nopad.py,sha256=r_TpHuYAVgq1pN81PiWe1bebtY-p9MBndBaoIE2VXrk,5180
 sglang/srt/layers/extend_attention.py,sha256=zuNnAdL_wF6BX0Mwn1dgDJvh3YJjYwqa5Fbzp8muOVc,12573
 sglang/srt/layers/fused_moe.py,sha256=KmyXwau2OOZpQimGIQrHptzGNs1trIud5AKEEKXdzPU,20823
@@ -47,14 +47,16 @@ sglang/srt/managers/controller_single.py,sha256=CdQ9_XPZdcWF5jArDmVR8K-WZ9_8Gpgk
 sglang/srt/managers/detokenizer_manager.py,sha256=GXWdW4n2N-otL3zcgdr0t1PcEe2EmQJA8AElntiNV1o,5606
 sglang/srt/managers/io_struct.py,sha256=Rz7Ur9Yw6prDGdy6XjsSiUmVBccS6cef-G_9TW7HA_4,7105
 sglang/srt/managers/policy_scheduler.py,sha256=ajSB-gCC6VJkXvnKU8FYU3Kgcigozp2pMTwF84Wp14o,3138
-sglang/srt/managers/schedule_batch.py,sha256=…
+sglang/srt/managers/schedule_batch.py,sha256=LIoVCPNivh0u1dOrrWRgFD6a4ywq3nrG_4dNgCK0kIw,37697
 sglang/srt/managers/tokenizer_manager.py,sha256=tEct3shjjw_7ickj_cmt9IxoBHfgbryQHI7DZS0m4TA,20511
-sglang/srt/managers/tp_worker.py,sha256=…
+sglang/srt/managers/tp_worker.py,sha256=JPLneFwcPlmPXZX1QxZHWgcdau8FC8wNuVqfCqsgOkU,35234
+sglang/srt/mem_cache/base_cache.py,sha256=czyN8IumXcMQskYOZDV3DzjfD4kdR-qwLVxceDqnOmE,788
+sglang/srt/mem_cache/chunk_cache.py,sha256=u1mkGoTI7_31H0i0mhKT7S57StYSsdmsSPqyGubE7lY,1560
 sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
 sglang/srt/mem_cache/memory_pool.py,sha256=wkhjyYLbAZrl2FB5i4ODkxgMufBuDpe4N0kbXhu6ZO0,4509
-sglang/srt/mem_cache/radix_cache.py,sha256=…
+sglang/srt/mem_cache/radix_cache.py,sha256=pa5RD4xNKPSuvL55BnC4mimoca5oJRXr4Rg91-sbTcs,8881
 sglang/srt/model_executor/cuda_graph_runner.py,sha256=OdmO6R7nHWrRJCtZOxYkt0KNdGoX7Md4knsypwPYjaQ,9365
-sglang/srt/model_executor/model_runner.py,sha256=…
+sglang/srt/model_executor/model_runner.py,sha256=fo3fbnNaHkcHz2UDkyvFjU7sGvdClhmhdelQh0n9PgA,16079
 sglang/srt/model_loader/model_loader.py,sha256=QmZUhHh1nmWrfYlunfnxMcTsIvip1l6aMIlrXoCED4I,10697
 sglang/srt/model_loader/utils.py,sha256=0AoWXX9uV5rKRYXJ4HduSnvdeerytI4ONCLCH6X4XFQ,10675
 sglang/srt/models/chatglm.py,sha256=vYWooqyPmcSFZNjxj_g5I_FgHJlDytbEiz6vyv3JBNM,13856
@@ -67,7 +69,7 @@ sglang/srt/models/gemma2.py,sha256=kTjZcsptgtYaO8BL_NlygjVSMSloq2Mc4Rf3FKvEhbs,1
 sglang/srt/models/gpt_bigcode.py,sha256=U7GmHKywSu12D-EwvuWv3RwHkx6bPawaRIjlFIpQkfs,10194
 sglang/srt/models/grok.py,sha256=NfZdsRVErDIUWFqjhtNf2pqC9G4cRdYHBFpgDq1IZ2A,27855
 sglang/srt/models/internlm2.py,sha256=Ld2GUxZeqqqJ2vd4QiX2s1y2AceJLA1nVnUYY88GMQk,12219
-sglang/srt/models/llama2.py,sha256=…
+sglang/srt/models/llama2.py,sha256=zfOk3OK1_B6s6yuXsZFmNCf07RsfytVD72GunLBt8Cc,14282
 sglang/srt/models/llama_classification.py,sha256=4r_orFZqBR3U_yC4bus1K3Z3-ADscYGSzgA82_VDN0g,4926
 sglang/srt/models/llava.py,sha256=BJphgyQGdo7uTpJcKGEfWwdpH9GTMDnyiznLSSgmvm8,18476
 sglang/srt/models/llavavid.py,sha256=-7vaVqaIfukCvMkNakEPblpwjIHC6ezrAvmpE5RzlUY,13602
@@ -80,14 +82,14 @@ sglang/srt/models/qwen2.py,sha256=mXlVd6UTCXY3VdgodFpQnlaY-NYLIbA-SknxdA9R13w,12
 sglang/srt/models/qwen2_moe.py,sha256=YYdJEezic7GyW-_bXlNIaqBa0C4IHQpz_vuRBLxms4k,18141
 sglang/srt/models/stablelm.py,sha256=b3d-ZwLQoLjZ6CupnkIq7d-z9tzGSxAyIcgSmZiZxZw,11362
 sglang/srt/models/yivl.py,sha256=p4s_D_m4H2exP4b91Y-CTkq8T-eIG3DJsFy9pB0e7TM,4932
-sglang/srt/openai_api/adapter.py,sha256=…
-sglang/srt/openai_api/protocol.py,sha256=…
+sglang/srt/openai_api/adapter.py,sha256=MaWz78cvkk5RdotRMCIf_K5xYAClX7TonjxH_dzUrVI,32495
+sglang/srt/openai_api/protocol.py,sha256=JXLnnQ63I-bJv93ICPfP0cBpyomQA5IYE_mkUg5X4Es,8177
 sglang/test/test_conversation.py,sha256=gF_AyOxQgpPQBPnA57-kq-M0p_zFu-rBDMFgAq655Rw,1596
 sglang/test/test_openai_protocol.py,sha256=DVx3r6hrb8oRqbo5AYIleldxbqMBTtb-gtORM6t_Y1c,1661
-sglang/test/test_programs.py,sha256=…
+sglang/test/test_programs.py,sha256=0M8blaIy--eEE2dQnG4FyjIETT_wa7eEG3S9UWna6_4,13851
 sglang/test/test_utils.py,sha256=kD_fQe3WroZ9Kc3NBRKPiZOFJ_JD2uEE9XIvPp6AD9Y,11048
-sglang-0.2.7.dist-info/LICENSE,sha256=…
-sglang-0.2.7.dist-info/METADATA,sha256=…
-sglang-0.2.7.dist-info/WHEEL,sha256=…
-sglang-0.2.7.dist-info/top_level.txt,sha256=…
-sglang-0.2.7.dist-info/RECORD,,
+sglang-0.2.8.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.2.8.dist-info/METADATA,sha256=FRkxB6W7NQlj9ar65-oppfES5tc1pS8LRPJXU-43hsQ,33214
+sglang-0.2.8.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+sglang-0.2.8.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.2.8.dist-info/RECORD,,
{sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/LICENSE
File without changes
{sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/WHEEL
File without changes
{sglang-0.2.7.dist-info → sglang-0.2.8.dist-info}/top_level.txt
File without changes