sglang 0.4.0__py3-none-any.whl → 0.4.0.post2__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (72)
  1. sglang/__init__.py +1 -1
  2. sglang/bench_offline_throughput.py +18 -6
  3. sglang/bench_one_batch.py +13 -0
  4. sglang/bench_serving.py +8 -1
  5. sglang/check_env.py +140 -48
  6. sglang/lang/backend/runtime_endpoint.py +1 -0
  7. sglang/lang/chat_template.py +32 -0
  8. sglang/llama3_eval.py +316 -0
  9. sglang/srt/constrained/outlines_backend.py +5 -0
  10. sglang/srt/constrained/xgrammar_backend.py +9 -6
  11. sglang/srt/layers/attention/__init__.py +5 -2
  12. sglang/srt/layers/attention/double_sparsity_backend.py +22 -8
  13. sglang/srt/layers/attention/flashinfer_backend.py +22 -5
  14. sglang/srt/layers/attention/torch_native_backend.py +22 -8
  15. sglang/srt/layers/attention/triton_backend.py +38 -33
  16. sglang/srt/layers/attention/triton_ops/decode_attention.py +305 -350
  17. sglang/srt/layers/attention/triton_ops/extend_attention.py +3 -0
  18. sglang/srt/layers/ep_moe/__init__.py +0 -0
  19. sglang/srt/layers/ep_moe/kernels.py +349 -0
  20. sglang/srt/layers/ep_moe/layer.py +665 -0
  21. sglang/srt/layers/fused_moe_triton/fused_moe.py +64 -21
  22. sglang/srt/layers/fused_moe_triton/layer.py +1 -1
  23. sglang/srt/layers/logits_processor.py +133 -95
  24. sglang/srt/layers/quantization/__init__.py +2 -47
  25. sglang/srt/layers/quantization/fp8.py +607 -0
  26. sglang/srt/layers/quantization/fp8_utils.py +27 -0
  27. sglang/srt/layers/radix_attention.py +11 -2
  28. sglang/srt/layers/sampler.py +29 -5
  29. sglang/srt/layers/torchao_utils.py +58 -45
  30. sglang/srt/managers/detokenizer_manager.py +37 -17
  31. sglang/srt/managers/io_struct.py +39 -10
  32. sglang/srt/managers/schedule_batch.py +39 -24
  33. sglang/srt/managers/schedule_policy.py +64 -5
  34. sglang/srt/managers/scheduler.py +236 -197
  35. sglang/srt/managers/tokenizer_manager.py +99 -58
  36. sglang/srt/managers/tp_worker_overlap_thread.py +7 -5
  37. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  38. sglang/srt/mem_cache/chunk_cache.py +2 -2
  39. sglang/srt/mem_cache/memory_pool.py +5 -1
  40. sglang/srt/mem_cache/radix_cache.py +12 -2
  41. sglang/srt/model_executor/cuda_graph_runner.py +39 -11
  42. sglang/srt/model_executor/model_runner.py +24 -9
  43. sglang/srt/model_parallel.py +67 -10
  44. sglang/srt/models/commandr.py +2 -2
  45. sglang/srt/models/deepseek_v2.py +87 -7
  46. sglang/srt/models/gemma2.py +34 -0
  47. sglang/srt/models/gemma2_reward.py +0 -1
  48. sglang/srt/models/granite.py +517 -0
  49. sglang/srt/models/grok.py +72 -13
  50. sglang/srt/models/llama.py +22 -5
  51. sglang/srt/models/llama_classification.py +11 -23
  52. sglang/srt/models/llama_reward.py +0 -2
  53. sglang/srt/models/llava.py +37 -14
  54. sglang/srt/models/mixtral.py +12 -9
  55. sglang/srt/models/phi3_small.py +0 -5
  56. sglang/srt/models/qwen2.py +20 -0
  57. sglang/srt/models/qwen2_moe.py +0 -5
  58. sglang/srt/models/torch_native_llama.py +0 -5
  59. sglang/srt/openai_api/adapter.py +4 -0
  60. sglang/srt/openai_api/protocol.py +9 -4
  61. sglang/srt/sampling/sampling_batch_info.py +9 -8
  62. sglang/srt/server.py +4 -4
  63. sglang/srt/server_args.py +62 -13
  64. sglang/srt/utils.py +57 -10
  65. sglang/test/test_utils.py +3 -2
  66. sglang/utils.py +10 -3
  67. sglang/version.py +1 -1
  68. {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/METADATA +15 -9
  69. {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/RECORD +72 -65
  70. {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/LICENSE +0 -0
  71. {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/WHEEL +0 -0
  72. {sglang-0.4.0.dist-info → sglang-0.4.0.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/sampler.py

@@ -51,7 +51,6 @@ class Sampler(nn.Module):
         # Post process logits
         logits.div_(sampling_info.temperatures)
         probs = torch.softmax(logits, dim=-1)
-        logits = None
         del logits
 
         if global_server_args_dict["sampling_backend"] == "flashinfer":
@@ -84,6 +83,7 @@ class Sampler(nn.Module):
                     sampling_info.top_ks,
                     sampling_info.top_ps,
                     sampling_info.min_ps,
+                    sampling_info.need_min_p_sampling,
                 )
             else:
                 raise ValueError(
@@ -98,18 +98,42 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
     top_ks: torch.Tensor,
     top_ps: torch.Tensor,
     min_ps: torch.Tensor,
+    need_min_p_sampling: bool,
 ):
     """A top-k, top-p and min-p sampling implementation with native pytorch operations."""
     probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
     probs_sum = torch.cumsum(probs_sort, dim=-1)
-    min_p_thresholds = probs_sort[:, 0] * min_ps
-    probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
     probs_sort[
         torch.arange(0, probs.shape[-1], device=probs.device).view(1, -1)
         >= top_ks.view(-1, 1)
     ] = 0.0
-    probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0
-    probs_sort.div_(probs_sort.max(dim=-1, keepdim=True)[0])
+    probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
+
+    if need_min_p_sampling:
+        min_p_thresholds = probs_sort[:, 0] * min_ps
+        probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0
+
     sampled_index = torch.multinomial(probs_sort, num_samples=1)
+    # int32 range is enough to represent the token ids
+    probs_idx = probs_idx.to(torch.int32)
     batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index).view(-1)
     return batch_next_token_ids
+
+
+def top_p_normalize_probs(
+    probs: torch.Tensor,
+    top_ps: torch.Tensor,
+):
+    if global_server_args_dict["sampling_backend"] == "flashinfer":
+        return top_p_renorm_prob(probs, top_ps)
+    elif global_server_args_dict["sampling_backend"] == "pytorch":
+        # See also top_k_top_p_min_p_sampling_from_probs_torch
+        probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
+        probs_sum = torch.cumsum(probs_sort, dim=-1)
+        probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
+        probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
+        return torch.zeros_like(probs_sort).scatter_(-1, probs_idx, probs_sort)
+    else:
+        raise ValueError(
+            f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
+        )
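The behavioral change in the PyTorch fallback sampler is the filter order (top-k, then top-p, then min-p only when a request in the batch actually uses it) and the int32 cast of the gathered token ids. The sketch below reproduces that ordering as a standalone function on a toy batch; the function and variable names are local to this example, not sglang APIs.

```python
import torch


def sample_top_k_top_p_min_p(probs, top_ks, top_ps, min_ps, need_min_p_sampling):
    # Sort once; all filters operate on the sorted copy.
    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    # Top-k: zero out everything beyond each row's k-th ranked token.
    ranks = torch.arange(probs.shape[-1], device=probs.device).view(1, -1)
    probs_sort[ranks >= top_ks.view(-1, 1)] = 0.0
    # Top-p: zero out tokens whose preceding cumulative mass already exceeds p.
    probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
    # Min-p is skipped entirely unless some request needs it.
    if need_min_p_sampling:
        min_p_thresholds = probs_sort[:, 0] * min_ps
        probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0
    sampled_index = torch.multinomial(probs_sort, num_samples=1)
    # int32 is wide enough for vocabulary indices.
    return torch.gather(probs_idx.to(torch.int32), dim=1, index=sampled_index).view(-1)


probs = torch.softmax(torch.randn(2, 8), dim=-1)
next_ids = sample_top_k_top_p_min_p(
    probs,
    top_ks=torch.tensor([4, 8]),
    top_ps=torch.tensor([0.9, 1.0]),
    min_ps=torch.tensor([0.05, 0.0]),
    need_min_p_sampling=True,
)
print(next_ids)  # e.g. tensor([3, 1], dtype=torch.int32)
```

The old renormalization before sampling can be dropped because torch.multinomial accepts unnormalized non-negative weights.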
sglang/srt/layers/torchao_utils.py

@@ -2,23 +2,24 @@
 Common utilities for torchao.
 """
 
-from typing import Dict, Set
-
 import torch
 
 
-def torchao_quantize_param_data(param: torch.Tensor, torchao_config: str):
-    """Quantize a Tensor with torchao quantization specified by torchao_config
+def apply_torchao_config_to_model(
+    model: torch.nn.Module, torchao_config: str, filter_fn=None
+):
+    """Quantize a modelwith torchao quantization specified by torchao_config
 
     Args:
-       `param`: weight parameter of the linear module
-       `torchao_config`: type of quantization and their arguments we want to use to
-       quantize the Tensor, e.g. int4wo-128 means int4 weight only quantization with group_size
+       `model`: a model to be quantized based on torchao_config
+       `torchao_config` (str): type of quantization and their arguments we want to use to
+       quantize the model, e.g. int4wo-128 means int4 weight only quantization with group_size
        128
     """
     # Lazy import to suppress some warnings
     from torchao.quantization import (
         float8_dynamic_activation_float8_weight,
+        float8_weight_only,
         int4_weight_only,
         int8_dynamic_activation_int8_weight,
         int8_weight_only,
@@ -26,12 +27,17 @@ def torchao_quantize_param_data(param: torch.Tensor, torchao_config: str):
     )
     from torchao.quantization.observer import PerRow, PerTensor
 
-    dummy_linear = torch.nn.Linear(param.shape[1], param.shape[0], bias=False)
-    dummy_linear.weight = param
-    if "int8wo" in torchao_config:
-        quantize_(dummy_linear, int8_weight_only())
+    if filter_fn is None:
+
+        def filter_fn(module, fqn):
+            return "proj" in fqn
+
+    if torchao_config == "" or torchao_config is None:
+        return model
+    elif "int8wo" in torchao_config:
+        quantize_(model, int8_weight_only(), filter_fn=filter_fn)
     elif "int8dq" in torchao_config:
-        quantize_(dummy_linear, int8_dynamic_activation_int8_weight())
+        quantize_(model, int8_dynamic_activation_int8_weight(), filter_fn=filter_fn)
     elif "int4wo" in torchao_config:
         group_size = int(torchao_config.split("-")[-1])
         assert group_size in [
@@ -40,13 +46,46 @@ def torchao_quantize_param_data(param: torch.Tensor, torchao_config: str):
             128,
             256,
         ], f"int4wo groupsize needs to be one of [32, 64, 128, 256] but got {group_size}"
-        quantize_(dummy_linear, int4_weight_only(group_size=group_size))
-    elif "fp8wo" in torchao_config:
-        from torchao.quantization import float8_weight_only
+        quantize_(model, int4_weight_only(group_size=group_size), filter_fn=filter_fn)
+    elif "gemlite" in torchao_config:
+        # gemlite-<packing_bitwidth>-<bit_width>-<group_size> or
+        # gemlite-<bit_width>-<group_size> (packing_bitwidth defaults to 32)
+        import os
+        import pwd
+
+        import gemlite
+        from gemlite.core import GemLiteLinearTriton, set_autotune
+
+        try:
+            from torchao.quantization import gemlite_uintx_weight_only
+        except:
+            print(
+                f"import `gemlite_uintx_weight_only` failed, please use torchao nightly to use gemlite quantization"
+            )
+            return model
+
+        _quant_args = torchao_config.split("-")
+        bit_width = int(_quant_args[-2])
+        group_size = None if _quant_args[-1] == "None" else int(_quant_args[-1])
+        try:
+            packing_bitwidth = int(_quant_args[-3])
+        except:
+            # if only 2 inputs found, use default value
+            packing_bitwidth = 32
+
+        quantize_(
+            model, gemlite_uintx_weight_only(group_size, bit_width, packing_bitwidth)
+        )
 
+        # try to load gemlite kernel config
+        GemLiteLinearTriton.load_config(
+            f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
+        )
+
+    elif "fp8wo" in torchao_config:
         # this requires newer hardware
         # [rank0]: AssertionError: fp8e4nv data type is not supported on CUDA arch < 89
-        quantize_(dummy_linear, float8_weight_only())
+        quantize_(model, float8_weight_only(), filter_fn=filter_fn)
     elif "fp8dq" in torchao_config:
         granularity = torchao_config.split("-")[-1]
         GRANULARITY_MAP = {
@@ -57,39 +96,13 @@ def torchao_quantize_param_data(param: torch.Tensor, torchao_config: str):
             granularity in GRANULARITY_MAP
         ), f"Supported granularity are: {GRANULARITY_MAP.keys()}, got {granularity}"
         quantize_(
-            dummy_linear,
+            model,
             float8_dynamic_activation_float8_weight(
                 granularity=GRANULARITY_MAP[granularity]
            ),
+            filter_fn=filter_fn,
         )
     else:
         raise ValueError(f"Unexpected config: {torchao_config}")
 
-    return dummy_linear.weight
-
-
-def apply_torchao_config_(
-    self: torch.nn.Module,
-    params_dict: Dict[str, torch.Tensor],
-    param_suffixes: Set[str],
-) -> None:
-    """A util function used for quantizing the weight parameters after they are loaded if
-    self.torchao_config is specified
-
-    Args:
-       `self`: the model we want to quantize
-       `params_dict`: dictionary mapping from param_name to the parameter Tensor
-       `param_suffixes`: a set of suffixes, we'll quantize the Tensor matching these suffixes
-
-    Returns:
-       None, the `params_dict` is modified inplace and the weights of `self` model are quantized
-    """
-    if self.torchao_config:
-        for param_suffix in param_suffixes:
-            for name in params_dict:
-                param = params_dict[name]
-                if param_suffix in name and param.ndim == 2:
-                    params_dict[name] = torchao_quantize_param_data(
-                        param, self.torchao_config
-                    )
-        self.load_state_dict(params_dict, assign=True)
+    return model
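The per-parameter helper (`torchao_quantize_param_data` plus `apply_torchao_config_`) is replaced by a single module-level entry point that quantizes the whole model in place and returns it. A hedged usage sketch follows; `TinyModel`, its layer names, and the device/dtype choices are illustrative (inside sglang the model comes from the model loader and the config string from the `--torchao-config` server argument), and whether a given config runs depends on the installed torchao version and hardware.

```python
import torch
from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model


class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.q_proj = torch.nn.Linear(256, 256, bias=False)
        self.lm_head = torch.nn.Linear(256, 32, bias=False)

    def forward(self, x):
        return self.lm_head(self.q_proj(x))


model = TinyModel()

# "int8wo" = int8 weight-only quantization. With the default filter_fn,
# only modules whose fully qualified name contains "proj" are quantized,
# so q_proj is converted while lm_head keeps its original weights.
model = apply_torchao_config_to_model(model, "int8wo")

# Passing an empty config string is now an explicit no-op that returns the
# model unchanged.
model = apply_torchao_config_to_model(model, "")
```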
sglang/srt/managers/detokenizer_manager.py

@@ -17,9 +17,10 @@ import dataclasses
 import logging
 import signal
 from collections import OrderedDict
-from typing import List, Union
+from typing import Dict, List, Union
 
 import psutil
+import setproctitle
 import zmq
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
@@ -28,7 +29,6 @@ from sglang.srt.managers.io_struct import (
     BatchStrOut,
     BatchTokenIDOut,
 )
-from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import configure_logger, get_zmq_socket
 from sglang.utils import find_printable_text, get_exception_traceback
@@ -75,17 +75,25 @@ class DetokenizerManager:
 
         self.decode_status = LimitedCapacityDict()
 
-    def trim_eos(self, output: Union[str, List[int]], finished_reason, no_stop_trim):
-        if no_stop_trim:
+    def trim_matched_stop(
+        self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
+    ):
+        if no_stop_trim or not finished_reason:
+            return output
+
+        matched = finished_reason.get("matched", None)
+        if not matched:
             return output
 
-        # Trim stop str. TODO(lmzheng): handle the case where multiple stop strs are hit
-        if isinstance(finished_reason, FINISH_MATCHED_STR) and isinstance(output, str):
-            pos = output.find(finished_reason.matched)
+        # TODO(lmzheng): handle the case where multiple stop strs are hit
+
+        # Trim stop str.
+        if isinstance(matched, str) and isinstance(output, str):
+            pos = output.find(matched)
             return output[:pos] if pos != -1 else output
-        if isinstance(finished_reason, FINISH_MATCHED_TOKEN) and isinstance(
-            output, list
-        ):
+
+        # Trim stop token.
+        if isinstance(matched, int) and isinstance(output, list):
             assert len(output) > 0
             return output[:-1]
         return output
@@ -124,9 +132,9 @@ class DetokenizerManager:
                     s.decode_ids = recv_obj.decode_ids[i]
 
                 read_ids.append(
-                    self.trim_eos(
+                    self.trim_matched_stop(
                         s.decode_ids[s.surr_offset :],
-                        recv_obj.finished_reason[i],
+                        recv_obj.finished_reasons[i],
                         recv_obj.no_stop_trim[i],
                     )
                 )
@@ -149,7 +157,7 @@ class DetokenizerManager:
             for i in range(bs):
                 s = self.decode_status[recv_obj.rids[i]]
                 new_text = read_texts[i][len(surr_texts[i]) :]
-                if recv_obj.finished_reason[i] is None:
+                if recv_obj.finished_reasons[i] is None:
                     # Streaming chunk: update the decode status
                     if len(new_text) > 0 and not new_text.endswith("�"):
                         s.decoded_text = s.decoded_text + new_text
@@ -160,9 +168,9 @@ class DetokenizerManager:
                     new_text = find_printable_text(new_text)
 
                 output_strs.append(
-                    self.trim_eos(
+                    self.trim_matched_stop(
                         s.decoded_text + new_text,
-                        recv_obj.finished_reason[i],
+                        recv_obj.finished_reasons[i],
                         recv_obj.no_stop_trim[i],
                     )
                 )
@@ -170,9 +178,20 @@ class DetokenizerManager:
             self.send_to_tokenizer.send_pyobj(
                 BatchStrOut(
                     rids=recv_obj.rids,
+                    finished_reasons=recv_obj.finished_reasons,
                     output_strs=output_strs,
-                    meta_info=recv_obj.meta_info,
-                    finished_reason=recv_obj.finished_reason,
+                    prompt_tokens=recv_obj.prompt_tokens,
+                    completion_tokens=recv_obj.completion_tokens,
+                    cached_tokens=recv_obj.cached_tokens,
+                    input_token_logprobs_val=recv_obj.input_token_logprobs_val,
+                    input_token_logprobs_idx=recv_obj.input_token_logprobs_idx,
+                    output_token_logprobs_val=recv_obj.output_token_logprobs_val,
+                    output_token_logprobs_idx=recv_obj.output_token_logprobs_idx,
+                    input_top_logprobs_val=recv_obj.input_top_logprobs_val,
+                    input_top_logprobs_idx=recv_obj.input_top_logprobs_idx,
+                    output_top_logprobs_val=recv_obj.output_top_logprobs_val,
+                    output_top_logprobs_idx=recv_obj.output_top_logprobs_idx,
+                    normalized_prompt_logprob=recv_obj.normalized_prompt_logprob,
                 )
             )
 
@@ -194,6 +213,7 @@ def run_detokenizer_process(
     server_args: ServerArgs,
     port_args: PortArgs,
 ):
+    setproctitle.setproctitle("sglang::detokenizer")
     configure_logger(server_args)
     parent_process = psutil.Process().parent()
 
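The detokenizer no longer imports FINISH_MATCHED_STR / FINISH_MATCHED_TOKEN: the finish reason now arrives as a plain dict, and the matched stop string or stop token id is read from its "matched" key. A standalone sketch of the trimming behavior (the dict shape follows the diff; the example values are made up):

```python
from typing import Dict, List, Union


def trim_matched_stop(
    output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
):
    if no_stop_trim or not finished_reason:
        return output
    matched = finished_reason.get("matched", None)
    if not matched:
        return output
    # Stop string: cut everything from the first occurrence of the match.
    if isinstance(matched, str) and isinstance(output, str):
        pos = output.find(matched)
        return output[:pos] if pos != -1 else output
    # Stop token: drop the trailing stop token id.
    if isinstance(matched, int) and isinstance(output, list):
        return output[:-1]
    return output


print(trim_matched_stop("Hello world<|eot|>", {"matched": "<|eot|>"}, no_stop_trim=False))
# -> "Hello world"
print(trim_matched_stop([12, 7, 99, 2], {"matched": 2}, no_stop_trim=False))
# -> [12, 7, 99]
```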
sglang/srt/managers/io_struct.py

@@ -308,6 +308,9 @@ class TokenizedEmbeddingReqInput:
 class BatchTokenIDOut:
     # The request id
     rids: List[str]
+    # The finish reason
+    finished_reasons: List[BaseFinishReason]
+    # For incremental decoding
     # The version id to sync decode status with in detokenizer_manager
     vids: List[int]
     decoded_texts: List[str]
@@ -315,35 +318,61 @@ class BatchTokenIDOut:
     read_offsets: List[int]
     # Only used when `--skip-tokenizer-init`
     output_ids: Optional[List[int]]
+    # Detokenization configs
     skip_special_tokens: List[bool]
     spaces_between_special_tokens: List[bool]
-    meta_info: List[Dict]
-    finished_reason: List[BaseFinishReason]
     no_stop_trim: List[bool]
+    # Token counts
+    prompt_tokens: List[int]
+    completion_tokens: List[int]
+    cached_tokens: List[int]
+    # Logprobs
+    input_token_logprobs_val: List[float]
+    input_token_logprobs_idx: List[int]
+    output_token_logprobs_val: List[float]
+    output_token_logprobs_idx: List[int]
+    input_top_logprobs_val: List[List]
+    input_top_logprobs_idx: List[List]
+    output_top_logprobs_val: List[List]
+    output_top_logprobs_idx: List[List]
+    normalized_prompt_logprob: List[float]
 
 
 @dataclass
 class BatchStrOut:
     # The request id
     rids: List[str]
+    # The finish reason
+    finished_reasons: List[dict]
     # The output decoded strings
     output_strs: List[str]
-    # The meta info
-    meta_info: List[Dict]
-    # The finish reason
-    finished_reason: List[BaseFinishReason]
+
+    # Token counts
+    prompt_tokens: List[int]
+    completion_tokens: List[int]
+    cached_tokens: List[int]
+    # Logprobs
+    input_token_logprobs_val: List[float]
+    input_token_logprobs_idx: List[int]
+    output_token_logprobs_val: List[float]
+    output_token_logprobs_idx: List[int]
+    input_top_logprobs_val: List[List]
+    input_top_logprobs_idx: List[List]
+    output_top_logprobs_val: List[List]
+    output_top_logprobs_idx: List[List]
+    normalized_prompt_logprob: List[float]
 
 
 @dataclass
 class BatchEmbeddingOut:
     # The request id
     rids: List[str]
+    # The finish reason
+    finished_reasons: List[BaseFinishReason]
     # The output embedding
     embeddings: List[List[float]]
-    # The meta info
-    meta_info: List[Dict]
-    # The finish reason
-    finished_reason: List[BaseFinishReason]
+    # Token counts
+    prompt_tokens: List[int]
 
 
 @dataclass
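For consumers, the practical effect is that the per-request meta_info dict is gone: BatchStrOut now carries flat parallel lists indexed by each request's position in the batch. A hedged sketch of how a receiver might reassemble per-request usage info; `recv_obj` and the returned dict layout are illustrative, not the tokenizer manager's actual code.

```python
def usage_for_request(recv_obj, i: int) -> dict:
    # All fields are parallel lists aligned with recv_obj.rids.
    return {
        "rid": recv_obj.rids[i],
        "finish_reason": recv_obj.finished_reasons[i],
        "prompt_tokens": recv_obj.prompt_tokens[i],
        "completion_tokens": recv_obj.completion_tokens[i],
        "cached_tokens": recv_obj.cached_tokens[i],
    }
```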
sglang/srt/managers/schedule_batch.py

@@ -58,6 +58,7 @@ global_server_args_dict = {
     "torchao_config": ServerArgs.torchao_config,
     "enable_nan_detection": ServerArgs.enable_nan_detection,
     "enable_dp_attention": ServerArgs.enable_dp_attention,
+    "enable_ep_moe": ServerArgs.enable_ep_moe,
 }
 
 
@@ -128,6 +129,7 @@ class ImageInputs:
     image_hashes: Optional[list] = None
     image_sizes: Optional[list] = None
     image_offsets: Optional[list] = None
+    image_pad_len: Optional[list] = None
     pad_values: Optional[list] = None
     modalities: Optional[list] = None
     num_image_tokens: Optional[int] = None
@@ -180,6 +182,7 @@ class ImageInputs:
         optional_args = [
             "image_sizes",
             "image_offsets",
+            "image_pad_len",
             # "modalities", # modalities should be ["multi-images"] (one entry) even for multiple images
             "aspect_ratio_ids",
             "aspect_ratio_mask",
@@ -199,6 +202,9 @@ class Req:
         origin_input_text: str,
         origin_input_ids: Tuple[int],
         sampling_params: SamplingParams,
+        return_logprob: bool = False,
+        top_logprobs_num: int = 0,
+        stream: bool = False,
         origin_input_ids_unpadded: Optional[Tuple[int]] = None,
         lora_path: Optional[str] = None,
         input_embeds: Optional[List[List[float]]] = None,
@@ -216,10 +222,11 @@ class Req:
         self.output_ids = []  # Each decode stage's output ids
         self.fill_ids = None  # fill_ids = origin_input_ids + output_ids
         self.session_id = session_id
+        self.input_embeds = input_embeds
 
+        # Sampling info
         self.sampling_params = sampling_params
         self.lora_path = lora_path
-        self.input_embeds = input_embeds
 
         # Memory pool info
         self.req_pool_idx = None
@@ -227,8 +234,8 @@ class Req:
         # Check finish
         self.tokenizer = None
         self.finished_reason = None
-        self.stream = False
         self.to_abort = False
+        self.stream = stream
 
         # For incremental decoding
         # ----- | --------- read_ids -------|
@@ -240,37 +247,46 @@ class Req:
         # 2: read_offset
         # 3: last token
         self.vid = 0  # version id to sync decode status with in detokenizer_manager
-        self.decoded_text = ""
         self.surr_offset = None  # Surrounding offset to defeat the cleanup algorithm
         self.read_offset = None
-
-        # The number of decoded tokens for token usage report. Note that
-        # this does not include the jump forward tokens.
-        self.completion_tokens_wo_jump_forward = 0
+        self.decoded_text = ""
 
         # For multimodal inputs
         self.image_inputs: Optional[ImageInputs] = None
 
         # Prefix info
         self.prefix_indices = []
+        # Tokens to run prefill. input_tokens - shared_prefix_tokens.
         self.extend_input_len = 0
         self.last_node = None
+
+        # Chunked prefill
         self.is_being_chunked = 0
 
         # For retraction
         self.is_retracted = False
 
         # Logprobs (arguments)
-        self.return_logprob = False
+        self.return_logprob = return_logprob
         self.logprob_start_len = 0
-        self.top_logprobs_num = 0
+        self.top_logprobs_num = top_logprobs_num
 
         # Logprobs (return value)
         self.normalized_prompt_logprob = None
-        self.input_token_logprobs = None
-        self.input_top_logprobs = None
-        self.output_token_logprobs = []
-        self.output_top_logprobs = []
+        self.input_token_logprobs_val = None
+        self.input_token_logprobs_idx = None
+        self.input_top_logprobs_val = None
+        self.input_top_logprobs_idx = None
+
+        if return_logprob:
+            self.output_token_logprobs_val = []
+            self.output_token_logprobs_idx = []
+            self.output_top_logprobs_val = []
+            self.output_top_logprobs_idx = []
+        else:
+            self.output_token_logprobs_val = self.output_token_logprobs_idx = (
+                self.output_top_logprobs_val
+            ) = self.output_top_logprobs_idx = None
 
         # Logprobs (internal values)
         # The tokens is prefilled but need to be considered as decode tokens
@@ -294,13 +310,14 @@ class Req:
         else:
             self.image_inputs.merge(image_inputs)
 
-    # whether request reached finished condition
     def finished(self) -> bool:
+        # Whether request reached finished condition
         return self.finished_reason is not None
 
     def init_next_round_input(self, tree_cache: Optional[BasePrefixCache] = None):
         self.fill_ids = self.origin_input_ids + self.output_ids
         if tree_cache is not None:
+            # tree cache is None if the prefix is not computed with tree cache.
             self.prefix_indices, self.last_node = tree_cache.match_prefix(
                 rid=self.rid, key=self.adjust_max_prefix_ids()
             )
@@ -453,8 +470,10 @@ class Req:
                 k = k + 1
             else:
                 break
-        self.output_token_logprobs = self.output_token_logprobs[:k]
-        self.output_top_logprobs = self.output_top_logprobs[:k]
+        self.output_token_logprobs_val = self.output_token_logprobs_val[:k]
+        self.output_token_logprobs_idx = self.output_token_logprobs_idx[:k]
+        self.output_top_logprobs_val = self.output_top_logprobs_val[:k]
+        self.output_top_logprobs_idx = self.output_top_logprobs_idx[:k]
         self.logprob_start_len = prompt_tokens + k
         self.last_update_decode_tokens = len(self.output_ids) - k
 
@@ -469,7 +488,7 @@ bid = 0
 
 @dataclasses.dataclass
 class ScheduleBatch:
-    """Store all inforamtion of a batch on the scheduler."""
+    """Store all information of a batch on the scheduler."""
 
     # Request, memory pool, and cache
     reqs: List[Req]
@@ -1067,9 +1086,9 @@ class ScheduleBatch:
         self.top_logprobs_nums = [0] * len(self.reqs) + other.top_logprobs_nums
         self.reqs.extend(other.reqs)
 
-        self.return_logprob = self.return_logprob or other.return_logprob
-        self.has_stream = self.has_stream or other.has_stream
-        self.has_grammar = self.has_grammar or other.has_grammar
+        self.return_logprob |= other.return_logprob
+        self.has_stream |= other.has_stream
+        self.has_grammar |= other.has_grammar
 
     def get_model_worker_batch(self):
         if self.forward_mode.is_decode() or self.forward_mode.is_idle():
@@ -1096,7 +1115,6 @@ class ScheduleBatch:
             seq_lens=self.seq_lens,
             out_cache_loc=self.out_cache_loc,
             seq_lens_sum=self.seq_lens_sum,
-            req_to_token_pool_records=self.req_to_token_pool.get_write_records(),
            return_logprob=self.return_logprob,
            top_logprobs_nums=self.top_logprobs_nums,
            global_num_tokens=self.global_num_tokens,
@@ -1151,9 +1169,6 @@ class ModelWorkerBatch:
     # The sum of all sequence lengths
     seq_lens_sum: int
 
-    # The memory pool operation records
-    req_to_token_pool_records: Optional[List[Tuple[Tuple, torch.Tensor]]]
-
     # For logprob
     return_logprob: bool
     top_logprobs_nums: Optional[List[int]]
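With this release, `return_logprob`, `top_logprobs_num`, and `stream` are passed to `Req.__init__` instead of being patched onto the object after construction, and the output logprob buffers are only allocated when logprobs are requested. A hedged construction sketch; the `rid` keyword and the SamplingParams import path and arguments are assumptions not shown in this diff, and the snippet is not meant as the scheduler's actual call site.

```python
from sglang.srt.managers.schedule_batch import Req
from sglang.srt.sampling.sampling_params import SamplingParams

req = Req(
    rid="req-0",                  # assumed; the request-id argument is not shown in this hunk
    origin_input_text="Hello",
    origin_input_ids=[9906],
    sampling_params=SamplingParams(max_new_tokens=16),
    return_logprob=True,          # previously set afterwards as req.return_logprob = True
    top_logprobs_num=5,           # previously set afterwards as req.top_logprobs_num = 5
    stream=True,                  # previously set afterwards as req.stream = True
)

# With return_logprob=True the output logprob buffers start as empty lists;
# otherwise they stay None and are never filled.
assert req.output_token_logprobs_val == []
```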