sglang 0.4.1.post5__py3-none-any.whl → 0.4.1.post6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/configs/model_config.py +15 -6
- sglang/srt/layers/attention/flashinfer_backend.py +17 -3
- sglang/srt/layers/linear.py +36 -98
- sglang/srt/layers/moe/fused_moe_triton/layer.py +37 -9
- sglang/srt/layers/moe/topk.py +4 -2
- sglang/srt/layers/parameter.py +24 -16
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/fp8.py +106 -52
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/int8_kernel.py +54 -0
- sglang/srt/layers/quantization/modelopt_quant.py +1 -1
- sglang/srt/layers/quantization/w8a8_int8.py +117 -0
- sglang/srt/layers/radix_attention.py +2 -0
- sglang/srt/layers/vocab_parallel_embedding.py +15 -2
- sglang/srt/managers/configure_logging.py +43 -0
- sglang/srt/managers/detokenizer_manager.py +0 -2
- sglang/srt/managers/io_struct.py +29 -13
- sglang/srt/managers/scheduler.py +48 -9
- sglang/srt/managers/tokenizer_manager.py +109 -49
- sglang/srt/mem_cache/memory_pool.py +107 -52
- sglang/srt/metrics/collector.py +10 -5
- sglang/srt/model_executor/model_runner.py +43 -6
- sglang/srt/models/llama.py +37 -2
- sglang/srt/models/qwen2.py +11 -0
- sglang/srt/models/qwen2_eagle.py +131 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +15 -5
- sglang/srt/sampling/sampling_batch_info.py +14 -5
- sglang/srt/sampling/sampling_params.py +1 -1
- sglang/srt/server.py +114 -61
- sglang/srt/server_args.py +27 -18
- sglang/srt/speculative/eagle_worker.py +1 -0
- sglang/srt/torch_memory_saver_adapter.py +59 -0
- sglang/srt/utils.py +29 -0
- sglang/version.py +1 -1
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post6.dist-info}/METADATA +12 -10
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post6.dist-info}/RECORD +39 -34
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post6.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post6.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post6.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp8.py CHANGED

@@ -1,7 +1,6 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/quantization/fp8.py

 import logging
-import os
 from typing import Any, Callable, Dict, List, Optional

 import torch
@@ -40,12 +39,15 @@ from sglang.srt.layers.quantization.fp8_utils import (
 from sglang.srt.utils import (
     get_bool_env_var,
     is_hip,
+    permute_weight,
     print_warning_once,
     set_weight_attrs,
 )

 ACTIVATION_SCHEMES = ["static", "dynamic"]

+is_hip_ = is_hip()
+
 logger = logging.getLogger(__name__)


@@ -161,7 +163,7 @@ class Fp8LinearMethod(LinearMethodBase):
         # kernel for fast weight-only FP8 quantization
         self.use_marlin = get_bool_env_var("SGLANG_FORCE_FP8_MARLIN")
         # Disable marlin for ROCm
-        if is_hip():
+        if is_hip_:
             self.use_marlin = False

         self.block_quant = self.quant_config.weight_block_size is not None
@@ -273,7 +275,7 @@ class Fp8LinearMethod(LinearMethodBase):
         # Block quant doesn't need to process weights after loading
         if self.block_quant:
             # If ROCm, normalize the weights and scales to e4m3fnuz
-            if is_hip():
+            if is_hip_:
                 # activation_scheme: dynamic
                 weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                     weight=layer.weight,
@@ -330,7 +332,7 @@ class Fp8LinearMethod(LinearMethodBase):
             weight_scale = layer.weight_scale

             # If ROCm, normalize the weights and scales to e4m3fnuz
-            if is_hip():
+            if is_hip_:
                 weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
                     weight=weight,
                     weight_scale=weight_scale,
@@ -567,7 +569,7 @@ class Fp8MoEMethod:
         # Block quant doesn't need to process weights after loading
         if self.block_quant:
             # If ROCm, normalize the weights and scales to e4m3fnuz
-            if is_hip():
+            if is_hip_:
                 # activation_scheme: dynamic
                 w13_weight, w13_weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
                     weight=layer.w13_weight,
@@ -594,7 +596,7 @@ class Fp8MoEMethod:
         # If checkpoint is fp16 or bfloat16, quantize in place.
         if not self.quant_config.is_checkpoint_fp8_serialized:
             # If ROCm, use float8_e4m3fnuz instead (MI300x HW)
-            fp8_dtype = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn
+            fp8_dtype = torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn
             w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
             w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)

@@ -616,18 +618,30 @@ class Fp8MoEMethod:
             layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
             layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)

-
-
-
-
-
-
-
-
-
-
-
-
+            if is_hip_:
+                if get_bool_env_var("CK_MOE"):
+                    layer.w13_weight = torch.nn.Parameter(
+                        permute_weight(layer.w13_weight.data),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
+                    layer.w2_weight = torch.nn.Parameter(
+                        permute_weight(layer.w2_weight.data),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
+                elif get_bool_env_var("MOE_PADDING"):
+                    # If ROCm, apply weight padding (min. Mem channel contention) only if set
+                    layer.w13_weight = torch.nn.Parameter(
+                        F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
+                    layer.w2_weight = torch.nn.Parameter(
+                        F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
             return

         # If checkpoint is fp8, we need to handle that the
@@ -658,7 +672,7 @@ class Fp8MoEMethod:
             )

             # If ROCm, normalize the weights and scales to e4m3fnuz
-            if is_hip():
+            if is_hip_:
                 # Normalize the weights and scales
                 w13_weight, w13_weight_scale, w13_input_scale = (
                     normalize_e4m3fn_to_e4m3fnuz(
@@ -708,18 +722,30 @@ class Fp8MoEMethod:
                 max_w13_scales, requires_grad=False
             )

-
-
-
-
-
-
-
-
-
-
-
-
+            if is_hip_:
+                if get_bool_env_var("CK_MOE"):
+                    layer.w13_weight = torch.nn.Parameter(
+                        permute_weight(layer.w13_weight.data),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
+                    layer.w2_weight = torch.nn.Parameter(
+                        permute_weight(layer.w2_weight.data),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
+                elif get_bool_env_var("MOE_PADDING"):
+                    # If ROCm, apply weight padding (min. Mem channel contention) only if set
+                    layer.w13_weight = torch.nn.Parameter(
+                        F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
+                    layer.w2_weight = torch.nn.Parameter(
+                        F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0),
+                        requires_grad=False,
+                    )
+                    torch.cuda.empty_cache()
             return

     def apply(
@@ -752,27 +778,55 @@ class Fp8MoEMethod:
             correction_bias=correction_bias,
         )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if is_hip_ and get_bool_env_var("CK_MOE"):
+            import ater
+            from ater.fused_moe import fused_experts_ck
+
+            return fused_experts_ck(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                use_fp8_w8a8=True,
+                w1_scale=(
+                    layer.w13_weight_scale_inv
+                    if self.block_quant
+                    else layer.w13_weight_scale
+                ),
+                w2_scale=(
+                    layer.w2_weight_scale_inv
+                    if self.block_quant
+                    else layer.w2_weight_scale
+                ),
+                a1_scale=layer.w13_input_scale,
+                a2_scale=layer.w2_input_scale,
+            )
+
+        else:
+            # Expert fusion with FP8 quantization
+            return fused_experts(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                inplace=True,
+                use_fp8_w8a8=True,
+                w1_scale=(
+                    layer.w13_weight_scale_inv
+                    if self.block_quant
+                    else layer.w13_weight_scale
+                ),
+                w2_scale=(
+                    layer.w2_weight_scale_inv
+                    if self.block_quant
+                    else layer.w2_weight_scale
+                ),
+                a1_scale=layer.w13_input_scale,
+                a2_scale=layer.w2_input_scale,
+                block_shape=self.quant_config.weight_block_size,
+            )


 class Fp8KVCacheMethod(BaseKVCacheMethod):
sglang/srt/layers/quantization/fp8_utils.py CHANGED

@@ -1,8 +1,8 @@
 from typing import List, Optional, Tuple

 import torch
-from vllm.model_executor.parameter import RowvLLMParameter, _ColumnvLLMParameter

+from sglang.srt.layers.parameter import RowvLLMParameter, _ColumnvLLMParameter
 from sglang.srt.layers.quantization.fp8_kernel import (
     per_token_group_quant_fp8,
     w8a8_block_fp8_matmul,
sglang/srt/layers/quantization/int8_kernel.py ADDED

@@ -0,0 +1,54 @@
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _per_token_quant_int8(
+    x_ptr,
+    xq_ptr,
+    scale_ptr,
+    stride_x,
+    stride_xq,
+    N,
+    BLOCK: tl.constexpr,
+):
+    # Adapted from https://github.com/InternLM/lmdeploy/blob/086481ed84b59bee3b8e4274e5fc69620040c048/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py#L282
+    row_id = tl.program_id(0)
+
+    cols = tl.arange(0, BLOCK)
+    mask = cols < N
+
+    x = tl.load(x_ptr + row_id * stride_x + cols, mask=mask, other=0.0).to(tl.float32)
+    absmax = tl.maximum(tl.max(tl.abs(x)), 1e-10)
+    scale_x = absmax / 127
+    x_q = x * (127 / absmax)
+    x_q = tl.extra.cuda.libdevice.round(x_q).to(tl.int8)
+
+    tl.store(xq_ptr + row_id * stride_xq + cols, x_q, mask=mask)
+    tl.store(scale_ptr + row_id, scale_x)
+
+
+def per_token_quant_int8(x):
+    M = x.numel() // x.shape[-1]
+    N = x.shape[-1]
+    x_q = torch.empty_like(x, device=x.device, dtype=torch.int8)
+    scales = torch.empty(x.shape[:-1] + (1,), device=x.device, dtype=torch.float32)
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+
+    assert x.is_contiguous()
+    _per_token_quant_int8[(M,)](
+        x,
+        x_q,
+        scales,
+        stride_x=x.stride(-2),
+        stride_xq=x_q.stride(-2),
+        N=N,
+        BLOCK=BLOCK,
+        num_warps=num_warps,
+        num_stages=1,
+    )
+
+    return x_q, scales
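The new kernel performs dynamic per-token symmetric quantization: each row is scaled by its absolute maximum divided by 127 and rounded to int8. As a rough reference only (not part of the package), the same math in plain PyTorch, which can be used to sanity-check the Triton kernel on small inputs, might look like:

```python
import torch


def per_token_quant_int8_ref(x: torch.Tensor):
    # Per-row (per-token) symmetric int8 quantization, mirroring the Triton kernel:
    # scale = absmax / 127, values rounded to int8.
    absmax = x.abs().amax(dim=-1, keepdim=True).clamp_min(1e-10).to(torch.float32)
    scale = absmax / 127.0
    x_q = torch.round(x.to(torch.float32) / scale).to(torch.int8)
    return x_q, scale


x = torch.randn(4, 128, dtype=torch.float16)
x_q, scale = per_token_quant_int8_ref(x)
# Dequantization recovers x up to rounding error (at most half a quantization step).
assert torch.allclose(x_q.to(torch.float32) * scale, x.to(torch.float32), atol=scale.max().item())
```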
sglang/srt/layers/quantization/modelopt_quant.py CHANGED

@@ -11,9 +11,9 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     cutlass_fp8_supported,
     requantize_with_max_scale,
 )
-from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter

 from sglang.srt.layers.linear import LinearMethodBase
+from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
sglang/srt/layers/quantization/w8a8_int8.py ADDED

@@ -0,0 +1,117 @@
+from typing import Any, Dict, List, Optional
+
+import torch
+
+from sglang.srt.utils import is_cuda_available
+
+is_cuda = is_cuda_available()
+if is_cuda:
+    from sgl_kernel import int8_scaled_mm
+
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.linear import LinearMethodBase
+from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter
+from sglang.srt.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+
+
+class W8A8Int8Config(QuantizationConfig):
+    """Config class for W8A8 Int8 Quantization.
+
+    - Weight: static, per-channel, symmetric
+    - Activation: dynamic, per-token, symmetric
+    """
+
+    def __init__(self):
+        pass
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 75
+
+    @classmethod
+    def get_name(self) -> str:
+        return "w8a8_int8"
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "W8A8Int8Config":
+        return cls()
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional["QuantizeMethodBase"]:
+        from vllm.model_executor.layers.linear import LinearBase
+
+        if isinstance(layer, LinearBase):
+            return W8A8Int8LinearMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class W8A8Int8LinearMethod(LinearMethodBase):
+
+    def __init__(self, quantization_config: W8A8Int8Config):
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.weight = Parameter(layer.weight.t(), requires_grad=False)
+        layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs
+    ):
+
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        self.logical_widths = output_partition_sizes
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes), input_size_per_partition, dtype=torch.int8
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        weight_scale = ChannelQuantScaleParameter(
+            data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        x_q, x_scale = per_token_quant_int8(x)
+
+        return int8_scaled_mm(
+            x_q, layer.weight, x_scale, layer.weight_scale, out_dtype=x.dtype, bias=bias
+        )
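W8A8Int8LinearMethod.apply quantizes activations per token on the fly and feeds them, together with the pre-quantized int8 weight and its per-output-channel scale, to sgl_kernel's int8_scaled_mm. As a reference only (not the code path the package uses), the underlying arithmetic can be written unfused in plain PyTorch, assuming the weight layout created above: an (out_features, in_features) int8 weight and an (out_features, 1) fp32 scale:

```python
import torch


def w8a8_int8_linear_ref(x, weight_int8, weight_scale, bias=None):
    # Dynamic per-token activation quantization (same math as per_token_quant_int8).
    absmax = x.abs().amax(dim=-1, keepdim=True).clamp_min(1e-10).to(torch.float32)
    x_scale = absmax / 127.0
    x_q = torch.round(x.to(torch.float32) / x_scale).to(torch.int8)

    # int8 x int8 GEMM with int32 accumulation (emulated), then rescale to float.
    acc = x_q.to(torch.int32) @ weight_int8.to(torch.int32).t()
    out = acc.to(torch.float32) * x_scale * weight_scale.t()  # (M,1) * (1,N) broadcast
    if bias is not None:
        out = out + bias.to(torch.float32)
    return out.to(x.dtype)


# Shape check: 4 tokens, hidden size 64, 32 output channels.
x = torch.randn(4, 64, dtype=torch.float16)
w = torch.randint(-127, 128, (32, 64), dtype=torch.int8)
w_scale = torch.rand(32, 1, dtype=torch.float32) * 0.01
assert w8a8_int8_linear_ref(x, w, w_scale).shape == (4, 32)
```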
sglang/srt/layers/vocab_parallel_embedding.py CHANGED

@@ -220,6 +220,7 @@ class VocabParallelEmbedding(torch.nn.Module):
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
         enable_tp: bool = True,
+        use_presharded_weights: bool = False,
     ):
         super().__init__()
         self.quant_config = quant_config
@@ -236,6 +237,12 @@ class VocabParallelEmbedding(torch.nn.Module):
         self.padding_size = padding_size
         self.org_vocab_size = org_num_embeddings or num_embeddings
         num_added_embeddings = num_embeddings - self.org_vocab_size
+        self.use_presharded_weights = use_presharded_weights
+        if use_presharded_weights:
+            assert (
+                num_added_embeddings == 0
+            ), "Lora is not supported with presharded weights."
+
         self.org_vocab_size_padded = pad_vocab_size(
             self.org_vocab_size, self.padding_size
         )
@@ -447,10 +454,14 @@ class VocabParallelEmbedding(torch.nn.Module):
             start_idx = start_idx // packed_factor
             shard_size = shard_size // packed_factor
         else:
-            assert loaded_weight.shape[output_dim] == self.org_vocab_size
+            assert loaded_weight.shape[output_dim] == (
+                self.org_vocab_size
+                // (self.tp_size if self.use_presharded_weights else 1)
+            )

         # Copy the data.
-        loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+        if not self.use_presharded_weights:
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
         param[: loaded_weight.shape[0]].data.copy_(loaded_weight)
         param[loaded_weight.shape[0] :].data.fill_(0)

@@ -514,6 +525,7 @@ class ParallelLMHead(VocabParallelEmbedding):
         padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_presharded_weights: bool = False,
     ):
         super().__init__(
             num_embeddings,
@@ -523,6 +535,7 @@ class ParallelLMHead(VocabParallelEmbedding):
             padding_size,
             quant_config,
             prefix,
+            use_presharded_weights=use_presharded_weights,
         )
         self.quant_config = quant_config
         if bias:
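The new use_presharded_weights flag indicates that the checkpoint already stores only this tensor-parallel rank's slice of the vocabulary, so the loader copies the shard as-is instead of narrowing the full table. A minimal sketch of that behavior (hypothetical helper, assuming the vocab dimension is dim 0):

```python
import torch


def load_vocab_shard(param, loaded_weight, start_idx, shard_size, use_presharded_weights):
    # With a presharded checkpoint the tensor already holds only this rank's rows,
    # so the narrow() step is skipped; otherwise slice the full table first.
    if not use_presharded_weights:
        loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
    param[: loaded_weight.shape[0]].copy_(loaded_weight)
    param[loaded_weight.shape[0] :].fill_(0)


# Example: vocab of 8 rows split across tp_size=2; rank 0 owns rows 0..3,
# padded to a 5-row shard. Both loading paths fill the shard identically.
full = torch.arange(8.0).unsqueeze(1)  # full (8, 1) embedding table
pre = full[:4].clone()                 # presharded checkpoint slice for rank 0
shard_a, shard_b = torch.empty(5, 1), torch.empty(5, 1)
load_vocab_shard(shard_a, full, start_idx=0, shard_size=4, use_presharded_weights=False)
load_vocab_shard(shard_b, pre, start_idx=0, shard_size=4, use_presharded_weights=True)
assert torch.equal(shard_a, shard_b)
```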
sglang/srt/managers/configure_logging.py ADDED

@@ -0,0 +1,43 @@
+"""
+Copyright 2023-2025 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+Configure the logging settings of a server.
+
+Usage:
+python3 -m sglang.srt.managers.configure_logging --url http://localhost:30000
+"""
+
+import argparse
+
+import requests
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--url", type=str, default="http://localhost:30000")
+    parser.add_argument(
+        "--dump-requests-folder", type=str, default="/tmp/sglang_request_dump"
+    )
+    parser.add_argument("--dump-requests-threshold", type=int, default=1000)
+    args = parser.parse_args()
+
+    response = requests.post(
+        args.url + "/configure_logging",
+        json={
+            "dump_requests_folder": args.dump_requests_folder,
+            "dump_requests_threshold": args.dump_requests_threshold,
+        },
+    )
+    assert response.status_code == 200
sglang/srt/managers/detokenizer_manager.py CHANGED

@@ -181,8 +181,6 @@ class DetokenizerManager:
                     finished_reasons=recv_obj.finished_reasons,
                     output_strs=output_strs,
                     prompt_tokens=recv_obj.prompt_tokens,
-                    origin_input_ids=recv_obj.origin_input_ids,
-                    output_ids=recv_obj.output_ids,
                     completion_tokens=recv_obj.completion_tokens,
                     cached_tokens=recv_obj.cached_tokens,
                     input_token_logprobs_val=recv_obj.input_token_logprobs_val,
sglang/srt/managers/io_struct.py CHANGED

@@ -19,9 +19,7 @@ processes (TokenizerManager, DetokenizerManager, Controller).
 import uuid
 from dataclasses import dataclass
 from enum import Enum
-from typing import Dict, List, Optional,
-
-import torch
+from typing import Dict, List, Optional, Union

 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -323,9 +321,7 @@ class BatchTokenIDOut:
     decoded_texts: List[str]
     decode_ids: List[int]
     read_offsets: List[int]
-    # Only used when
-    origin_input_ids: Optional[List[int]]
-    # Only used when `--skip-tokenizer-init` or `--return-token-ids` is set
+    # Only used when `--skip-tokenizer-init` is on
     output_ids: Optional[List[int]]
     # Detokenization configs
     skip_special_tokens: List[bool]
@@ -356,14 +352,7 @@ class BatchStrOut:
     # The output decoded strings
     output_strs: List[str]

-    # The token ids
-    origin_input_ids: Optional[List[int]]
-    output_ids: Optional[List[int]]
-
     # Token counts
-    # real input and output tokens can be get from
-    # origin_input_ids and output_ids by enabling --return_token_ids
-    # TODO (Shuai): Rename this to clarify the meaning.
     prompt_tokens: List[int]
     completion_tokens: List[int]
     cached_tokens: List[int]
@@ -468,6 +457,26 @@ class GetWeightsByNameReqOutput:
     parameter: list


+@dataclass
+class ReleaseMemoryOccupationReqInput:
+    pass
+
+
+@dataclass
+class ReleaseMemoryOccupationReqOutput:
+    pass
+
+
+@dataclass
+class ResumeMemoryOccupationReqInput:
+    pass
+
+
+@dataclass
+class ResumeMemoryOccupationReqOutput:
+    pass
+
+
 @dataclass
 class AbortReq:
     # The request id
@@ -479,6 +488,13 @@ class ProfileReq(Enum):
     STOP_PROFILE = 2


+@dataclass
+class ConfigureLoggingReq:
+    log_requests: Optional[bool] = None
+    dump_requests_folder: Optional[str] = None
+    dump_requests_threshold: Optional[int] = None
+
+
 @dataclass
 class OpenSessionReqInput:
     capacity_of_str_len: int