sglang 0.4.1.post3__py3-none-any.whl → 0.4.1.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sglang/bench_one_batch.py +2 -0
  2. sglang/srt/layers/attention/__init__.py +14 -5
  3. sglang/srt/layers/attention/double_sparsity_backend.py +0 -52
  4. sglang/srt/layers/attention/flashinfer_backend.py +211 -81
  5. sglang/srt/layers/attention/torch_native_backend.py +1 -38
  6. sglang/srt/layers/attention/triton_backend.py +20 -11
  7. sglang/srt/layers/attention/triton_ops/decode_attention.py +4 -0
  8. sglang/srt/layers/logits_processor.py +167 -212
  9. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  10. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  11. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  12. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  13. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  14. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  15. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  16. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +187 -29
  31. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -6
  32. sglang/srt/layers/quantization/fp8.py +2 -2
  33. sglang/srt/layers/sampler.py +57 -21
  34. sglang/srt/layers/torchao_utils.py +17 -3
  35. sglang/srt/managers/io_struct.py +1 -2
  36. sglang/srt/managers/schedule_batch.py +26 -2
  37. sglang/srt/managers/schedule_policy.py +159 -90
  38. sglang/srt/managers/scheduler.py +62 -26
  39. sglang/srt/managers/tokenizer_manager.py +22 -20
  40. sglang/srt/managers/tp_worker.py +16 -4
  41. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  42. sglang/srt/model_executor/cuda_graph_runner.py +118 -73
  43. sglang/srt/model_executor/forward_batch_info.py +33 -8
  44. sglang/srt/model_executor/model_runner.py +63 -61
  45. sglang/srt/models/deepseek_v2.py +34 -7
  46. sglang/srt/models/grok.py +97 -26
  47. sglang/srt/openai_api/adapter.py +0 -17
  48. sglang/srt/openai_api/protocol.py +3 -3
  49. sglang/srt/sampling/sampling_batch_info.py +21 -0
  50. sglang/srt/sampling/sampling_params.py +9 -1
  51. sglang/srt/server.py +9 -5
  52. sglang/srt/server_args.py +108 -57
  53. sglang/srt/speculative/build_eagle_tree.py +347 -0
  54. sglang/srt/speculative/eagle_utils.py +618 -0
  55. sglang/srt/speculative/eagle_worker.py +170 -0
  56. sglang/srt/speculative/spec_info.py +5 -0
  57. sglang/srt/utils.py +15 -2
  58. sglang/version.py +1 -1
  59. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/METADATA +9 -8
  60. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/RECORD +63 -39
  61. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/WHEEL +1 -1
  62. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/LICENSE +0 -0
  63. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 4
+    }
+}
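The tuned H200 MoE kernel tables above map a token-bucket key (roughly the per-expert batch size M) to Triton tile parameters (BLOCK_SIZE_M/N/K, GROUP_SIZE_M, num_warps, num_stages). As an illustrative sketch only, not the exact lookup sglang performs in fused_moe.py, a config for an arbitrary M can be chosen by falling back to the nearest available key:

import json
from typing import Dict

def load_moe_config(path: str) -> Dict[int, dict]:
    # JSON keys are strings ("1" ... "4096"); convert to ints for numeric lookup.
    with open(path) as f:
        return {int(k): v for k, v in json.load(f).items()}

def pick_config(configs: Dict[int, dict], m: int) -> dict:
    # Nearest-key fallback; an assumption for illustration, matching how such
    # tuned tables are commonly consumed, not necessarily sglang's exact rule.
    return configs[min(configs, key=lambda k: abs(k - m))]

# Hypothetical usage (file name follows the naming scheme in the list above):
# cfg = pick_config(load_moe_config("E=8,N=7168,device_name=NVIDIA_H200.json"), m=96)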
@@ -17,15 +17,21 @@ from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
 from sglang.srt.utils import direct_register_custom_op, get_device_name, is_hip
 
-not_hip = False
+is_hip_flag = False
 if not is_hip():
     from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
 
-    not_hip = True
+    is_hip_flag = False
+else:
+    is_hip_flag = True
 
 logger = logging.getLogger(__name__)
 padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0
 
+enable_moe_align_block_size_triton = bool(
+    int(os.getenv("ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
+)
+
 
 @triton.jit
 def fused_moe_kernel(
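ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON is captured into a module-level constant when fused_moe.py is imported, so opting into the Triton alignment path has to happen before the import. A minimal sketch:

import os

# Must be set before importing sglang.srt.layers.moe.fused_moe_triton.fused_moe,
# because the flag is read once at module import time (see the hunk above).
os.environ["ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON"] = "1"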
@@ -222,6 +228,139 @@ def fused_moe_kernel(
     tl.store(c_ptrs, accumulator, mask=c_mask)
 
 
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+
+@triton.jit
+def moe_align_block_size_stage1(
+    topk_ids_ptr,
+    tokens_cnts_ptr,
+    num_experts: tl.constexpr,
+    numel: tl.constexpr,
+    tokens_per_thread: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    start_idx = pid * tokens_per_thread
+
+    off_c = (pid + 1) * num_experts
+
+    for i in range(tokens_per_thread):
+        if start_idx + i < numel:
+            idx = tl.load(topk_ids_ptr + start_idx + i)
+            token_cnt = tl.load(tokens_cnts_ptr + off_c + idx)
+            tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1)
+
+
+@triton.jit
+def moe_align_block_size_stage2(
+    tokens_cnts_ptr,
+    num_experts: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    last_cnt = 0
+    for i in range(1, num_experts + 1):
+        token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid)
+        last_cnt = last_cnt + token_cnt
+        tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt)
+
+
+@triton.jit
+def moe_align_block_size_stage3(
+    total_tokens_post_pad_ptr,
+    tokens_cnts_ptr,
+    cumsum_ptr,
+    num_experts: tl.constexpr,
+    block_size: tl.constexpr,
+):
+    last_cumsum = 0
+    off_cnt = num_experts * num_experts
+    for i in range(1, num_experts + 1):
+        token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1)
+        last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size
+        tl.store(cumsum_ptr + i, last_cumsum)
+    tl.store(total_tokens_post_pad_ptr, last_cumsum)
+
+
+@triton.jit
+def moe_align_block_size_stage4(
+    topk_ids_ptr,
+    sorted_token_ids_ptr,
+    expert_ids_ptr,
+    tokens_cnts_ptr,
+    cumsum_ptr,
+    num_experts: tl.constexpr,
+    block_size: tl.constexpr,
+    numel: tl.constexpr,
+    tokens_per_thread: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    start_idx = tl.load(cumsum_ptr + pid)
+    end_idx = tl.load(cumsum_ptr + pid + 1)
+
+    for i in range(start_idx, end_idx, block_size):
+        tl.store(expert_ids_ptr + i // block_size, pid)
+
+    start_idx = pid * tokens_per_thread
+    off_t = pid * num_experts
+
+    for i in range(start_idx, tl.minimum(start_idx + tokens_per_thread, numel)):
+        expert_id = tl.load(topk_ids_ptr + i)
+        token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id)
+        rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id)
+        tl.store(sorted_token_ids_ptr + rank_post_pad, i)
+        tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1)
+
+
+def moe_align_block_size_triton(
+    topk_ids: torch.Tensor,
+    num_experts: int,
+    block_size: int,
+    sorted_token_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    num_tokens_post_pad: torch.Tensor,
+) -> None:
+    numel = topk_ids.numel()
+    grid = (num_experts,)
+    tokens_cnts = torch.zeros(
+        (num_experts + 1, num_experts), dtype=torch.int32, device=topk_ids.device
+    )
+    cumsum = torch.zeros((num_experts + 1,), dtype=torch.int32, device=topk_ids.device)
+    tokens_per_thread = ceil_div(numel, num_experts)
+
+    moe_align_block_size_stage1[grid](
+        topk_ids,
+        tokens_cnts,
+        num_experts,
+        numel,
+        tokens_per_thread,
+    )
+    moe_align_block_size_stage2[grid](
+        tokens_cnts,
+        num_experts,
+    )
+    moe_align_block_size_stage3[(1,)](
+        num_tokens_post_pad,
+        tokens_cnts,
+        cumsum,
+        num_experts,
+        block_size,
+    )
+    moe_align_block_size_stage4[grid](
+        topk_ids,
+        sorted_token_ids,
+        expert_ids,
+        tokens_cnts,
+        cumsum,
+        num_experts,
+        block_size,
+        numel,
+        tokens_per_thread,
+    )
+
+
 def moe_align_block_size(
     topk_ids: torch.Tensor, block_size: int, num_experts: int
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
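The four-stage Triton implementation above computes the same layout as moe_align_block_size: token indices grouped by expert, each expert's group padded to a multiple of block_size, plus one expert id per block and the padded total. A rough PyTorch reference of that layout, as a sketch for understanding rather than the path sglang executes:

import torch

def moe_align_block_size_reference(topk_ids: torch.Tensor, block_size: int, num_experts: int):
    # topk_ids: (num_tokens, top_k) expert assignments with values in [0, num_experts).
    numel = topk_ids.numel()
    flat = topk_ids.reshape(-1)
    counts = torch.bincount(flat, minlength=num_experts)
    padded = ((counts + block_size - 1) // block_size) * block_size  # per-expert padding
    cumsum = torch.cat([torch.zeros(1, dtype=torch.long), padded.cumsum(0)])

    # Unfilled slots keep a sentinel value; `numel` is used here as that sentinel.
    sorted_token_ids = torch.full((int(cumsum[-1]),), numel, dtype=torch.int32)
    expert_ids = torch.repeat_interleave(
        torch.arange(num_experts, dtype=torch.int32), padded // block_size
    )
    offsets = cumsum[:-1].clone()
    for i, e in enumerate(flat.tolist()):  # place each (token, expert) pair in expert order
        sorted_token_ids[offsets[e]] = i
        offsets[e] += 1
    return sorted_token_ids, expert_ids, int(cumsum[-1])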
@@ -272,24 +411,36 @@ def moe_align_block_size(
         (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
     )
     num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
-    if not_hip and num_experts >= 224:
-        token_cnts_buffer = torch.empty(
-            (num_experts + 1) * num_experts, dtype=torch.int32, device=topk_ids.device
-        )
-        cumsum_buffer = torch.empty(
-            num_experts + 1, dtype=torch.int32, device=topk_ids.device
-        )
+    if num_experts >= 224:
+        if enable_moe_align_block_size_triton or is_hip_flag:
+            moe_align_block_size_triton(
+                topk_ids,
+                num_experts,
+                block_size,
+                sorted_ids,
+                expert_ids,
+                num_tokens_post_pad,
+            )
+        else:
+            token_cnts_buffer = torch.empty(
+                (num_experts + 1) * num_experts,
+                dtype=torch.int32,
+                device=topk_ids.device,
+            )
+            cumsum_buffer = torch.empty(
+                num_experts + 1, dtype=torch.int32, device=topk_ids.device
+            )
 
-        sgl_moe_align_block_size(
-            topk_ids,
-            num_experts,
-            block_size,
-            sorted_ids,
-            expert_ids,
-            num_tokens_post_pad,
-            token_cnts_buffer,
-            cumsum_buffer,
-        )
+            sgl_moe_align_block_size(
+                topk_ids,
+                num_experts,
+                block_size,
+                sorted_ids,
+                expert_ids,
+                num_tokens_post_pad,
+                token_cnts_buffer,
+                cumsum_buffer,
+            )
     else:
         ops.moe_align_block_size(
             topk_ids,
@@ -326,9 +477,9 @@ def invoke_fused_moe_kernel(
 
     padded_size = 0
     if use_fp8_w8a8:
-        padded_size = padding_size
         assert B_scale is not None
         if block_shape is None:
+            padded_size = padding_size
             A, A_scale = ops.scaled_fp8_quant(A, A_scale)
         else:
             assert len(block_shape) == 2
@@ -463,7 +614,7 @@ def get_default_config(
                 "BLOCK_SIZE_K": 128,
                 "GROUP_SIZE_M": 32,
                 "num_warps": 8,
-                "num_stages": 4,
+                "num_stages": 2 if is_hip_flag else 4,
             }
             if M <= E:
                 config = {
@@ -472,7 +623,7 @@ def get_default_config(
                     "BLOCK_SIZE_K": 128,
                     "GROUP_SIZE_M": 1,
                     "num_warps": 4,
-                    "num_stages": 4,
+                    "num_stages": 2 if is_hip_flag else 4,
                 }
         else:
             # Block-wise quant: BLOCK_SIZE_K must be divisable by block_shape[1]
@@ -482,7 +633,7 @@ def get_default_config(
                 "BLOCK_SIZE_K": block_shape[1],
                 "GROUP_SIZE_M": 32,
                 "num_warps": 4,
-                "num_stages": 3,
+                "num_stages": 2 if is_hip_flag else 3,
             }
     else:
         config = {
@@ -727,7 +878,7 @@ def fused_experts_impl(
     block_shape: Optional[List[int]] = None,
 ):
     padded_size = padding_size
-    if not use_fp8_w8a8:
+    if not use_fp8_w8a8 or block_shape is not None:
         padded_size = 0
 
     # Check constraints.
@@ -854,11 +1005,18 @@ def fused_experts_impl(
             block_shape=block_shape,
         )
 
-        torch.sum(
-            intermediate_cache3.view(*intermediate_cache3.shape),
-            dim=1,
-            out=out_hidden_states[begin_chunk_idx:end_chunk_idx],
-        )
+        if is_hip_flag:
+            ops.moe_sum(
+                intermediate_cache3.view(*intermediate_cache3.shape),
+                out_hidden_states[begin_chunk_idx:end_chunk_idx],
+            )
+        else:
+            torch.sum(
+                intermediate_cache3.view(*intermediate_cache3.shape),
+                dim=1,
+                out=out_hidden_states[begin_chunk_idx:end_chunk_idx],
+            )
+
 
     return out_hidden_states
 
@@ -321,9 +321,12 @@ class FusedMoE(torch.nn.Module):
         # Index the loaded weight for tp sharding.
         # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
         shard_size = expert_data.shape[shard_dim] // 2
-        loaded_weight = loaded_weight.narrow(
-            shard_dim, shard_size * tp_rank, shard_size
-        )
+
+        if not self.use_presharded_weights:
+            loaded_weight = loaded_weight.narrow(
+                shard_dim, shard_size * tp_rank, shard_size
+            )
+
 
         # Narrow parameter and load.
         # w1, gate_proj: Load into first logical weight of w13.
@@ -347,9 +350,12 @@ class FusedMoE(torch.nn.Module):
         # down_proj: "RowParallel" so tp sharding on input_dim
         # Narrow parameter and load.
         shard_size = expert_data.shape[shard_dim]
-        loaded_weight = loaded_weight.narrow(
-            shard_dim, shard_size * tp_rank, shard_size
-        )
+
+        if not self.use_presharded_weights:
+            loaded_weight = loaded_weight.narrow(
+                shard_dim, shard_size * tp_rank, shard_size
+            )
+
         # w2, down_proj: Load into only logical weight of w2.
         expert_data.copy_(loaded_weight)
 
@@ -389,7 +395,9 @@ class FusedMoE(torch.nn.Module):
         weight_name: str,
         shard_id: str,
         expert_id: int,
+        use_presharded_weights: bool = False,
     ) -> None:
+        self.use_presharded_weights = use_presharded_weights
 
         # compressed-tensors checkpoints with packed weights are stored flipped
         # TODO (mgoin): check self.quant_method.quant_config.quant_format
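use_presharded_weights lets checkpoints that are already split per tensor-parallel rank skip the narrow() slice during expert weight loading. A small sketch of the slice that gets skipped, with made-up shapes:

import torch

tp_rank, tp_size = 1, 4
full_weight = torch.randn(1024, 256)          # hypothetical unsharded expert weight
shard_size = full_weight.shape[0] // tp_size

# Default path: carve this rank's slice out of the full tensor along the sharded dim.
shard = full_weight.narrow(0, shard_size * tp_rank, shard_size)

# With use_presharded_weights=True the loaded tensor is assumed to already be
# this rank's (shard_size, 256) slice, so the narrow() above is skipped.
assert shard.shape == (shard_size, 256)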
@@ -280,9 +280,9 @@ class Fp8LinearMethod(LinearMethodBase):
                 weight_scale=layer.weight_scale_inv,
                 input_scale=None,
             )
-            layer.weight = torch.nn.Parameter(weight, require_grad=False)
+            layer.weight = torch.nn.Parameter(weight, requires_grad=False)
             layer.weight_scale_inv = torch.nn.Parameter(
-                weight_scale, require_grad=False
+                weight_scale, requires_grad=False
             )
             layer.input_scale = None
             return
@@ -1,5 +1,5 @@
 import logging
-from typing import Union
+from typing import List
 
 import torch
 from torch import nn
@@ -28,13 +28,12 @@ class Sampler(nn.Module):
 
     def forward(
         self,
-        logits: Union[torch.Tensor, LogitsProcessorOutput],
+        logits_output: LogitsProcessorOutput,
         sampling_info: SamplingBatchInfo,
+        return_logprob: bool,
+        top_logprobs_nums: List[int],
     ):
-        if isinstance(logits, LogitsProcessorOutput):
-            logits = logits.next_token_logits
-
-        logits = logits.contiguous()
+        logits = logits_output.next_token_logits
 
         if self.use_nan_detectioin and torch.any(torch.isnan(logits)):
             logger.warning("Detected errors during sampling! NaN in the logits.")
@@ -47,6 +46,8 @@ class Sampler(nn.Module):
         if sampling_info.is_all_greedy:
             # Use torch.argmax if all requests use greedy sampling
             batch_next_token_ids = torch.argmax(logits, -1)
+            if return_logprob:
+                logprobs = torch.nn.functional.log_softmax(logits, dim=-1)
         else:
             # Post process logits
             logits.div_(sampling_info.temperatures)
@@ -54,6 +55,14 @@ class Sampler(nn.Module):
             del logits
 
             if global_server_args_dict["sampling_backend"] == "flashinfer":
+                if return_logprob:
+                    # NOTE: the top_p_renorm_prob from flashinfer has numerical problems,
+                    # https://github.com/flashinfer-ai/flashinfer/issues/708
+                    # so we use the torch implementation.
+                    logprobs = torch.log(
+                        top_p_normalize_probs_torch(probs, sampling_info.top_ps)
+                    )
+
                 max_top_k_round, batch_size = 32, probs.shape[0]
                 uniform_samples = torch.rand(
                     (max_top_k_round, batch_size), device=probs.device
@@ -76,6 +85,7 @@ class Sampler(nn.Module):
                 if self.use_nan_detectioin and not torch.all(success):
                     logger.warning("Detected errors during sampling!")
                     batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
+
             elif global_server_args_dict["sampling_backend"] == "pytorch":
                 # A slower fallback implementation with torch native operations.
                 batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
@@ -85,12 +95,31 @@ class Sampler(nn.Module):
                     sampling_info.min_ps,
                     sampling_info.need_min_p_sampling,
                 )
+                if return_logprob:
+                    logprobs = torch.log(
+                        top_p_normalize_probs_torch(probs, sampling_info.top_ps)
+                    )
             else:
                 raise ValueError(
                     f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
                 )
 
-        return batch_next_token_ids.to(torch.int32)
+        batch_next_token_ids = batch_next_token_ids.to(torch.int32)
+
+        # Attach logprobs to logits_output (in-place modification)
+        if return_logprob:
+            if any(x > 0 for x in top_logprobs_nums):
+                (
+                    logits_output.next_token_top_logprobs_val,
+                    logits_output.next_token_top_logprobs_idx,
+                ) = get_top_logprobs(logprobs, top_logprobs_nums)
+
+            logits_output.next_token_logprobs = logprobs[
+                torch.arange(len(batch_next_token_ids), device=sampling_info.device),
+                batch_next_token_ids,
+            ]
+
+        return batch_next_token_ids
 
 
 def top_k_top_p_min_p_sampling_from_probs_torch(
@@ -120,20 +149,27 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
     return batch_next_token_ids
 
 
-def top_p_normalize_probs(
+def top_p_normalize_probs_torch(
     probs: torch.Tensor,
     top_ps: torch.Tensor,
 ):
-    if global_server_args_dict["sampling_backend"] == "flashinfer":
-        return top_p_renorm_prob(probs, top_ps)
-    elif global_server_args_dict["sampling_backend"] == "pytorch":
-        # See also top_k_top_p_min_p_sampling_from_probs_torch
-        probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
-        probs_sum = torch.cumsum(probs_sort, dim=-1)
-        probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
-        probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
-        return torch.zeros_like(probs_sort).scatter_(-1, probs_idx, probs_sort)
-    else:
-        raise ValueError(
-            f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
-        )
+    # See also top_k_top_p_min_p_sampling_from_probs_torch
+    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
+    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
+    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
+    return torch.zeros_like(probs_sort).scatter_(-1, probs_idx, probs_sort)
+
+
+def get_top_logprobs(logprobs: torch.Tensor, top_logprobs_nums: List[int]):
+    max_k = max(top_logprobs_nums)
+    ret = logprobs.topk(max_k, dim=1)
+    values = ret.values.tolist()
+    indices = ret.indices.tolist()
+
+    output_top_logprobs_val = []
+    output_top_logprobs_idx = []
+    for i, k in enumerate(top_logprobs_nums):
+        output_top_logprobs_val.append(values[i][:k])
+        output_top_logprobs_idx.append(indices[i][:k])
+    return output_top_logprobs_val, output_top_logprobs_idx
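top_p_normalize_probs_torch masks out the probability tail that falls outside each request's top-p budget and renormalizes what remains; get_top_logprobs then trims a shared top-k to each request's requested count. A tiny worked example of the renormalization (values are illustrative):

import torch

probs = torch.tensor([[0.6, 0.2, 0.15, 0.05]])
top_ps = torch.tensor([0.7])

# Same steps as top_p_normalize_probs_torch above: sort, drop tokens whose preceding
# cumulative mass already exceeds top_p, renormalize, scatter back to original order.
probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
probs_sum = torch.cumsum(probs_sort, dim=-1)
probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
renormed = torch.zeros_like(probs_sort).scatter_(-1, probs_idx, probs_sort)
# renormed ~= [[0.75, 0.25, 0.0, 0.0]]: only tokens inside the 0.7 mass survive.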
@@ -11,6 +11,22 @@ import torch
 logger = logging.getLogger(__name__)
 
 
+def get_gemlite_cache_path() -> str:
+    return f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
+
+
+def save_gemlite_cache(print_error: bool = False) -> bool:
+    try:
+        from gemlite.core import GemLiteLinearTriton
+
+        GemLiteLinearTriton.cache_config(get_gemlite_cache_path())
+    except Exception:
+        if print_error:
+            logger.error("Failed to save the GemLite cache.")
+        return False
+    return True
+
+
 def apply_torchao_config_to_model(
     model: torch.nn.Module, torchao_config: str, filter_fn=None
 ):
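The two helpers give callers a single source of truth for the GemLite tuning-cache path and a guarded way to persist it. A usage sketch (the import path follows the file shown in the list above, sglang/srt/layers/torchao_utils.py):

from sglang.srt.layers.torchao_utils import get_gemlite_cache_path, save_gemlite_cache

print(get_gemlite_cache_path())            # e.g. /tmp/<gecos>_gemlite.json
ok = save_gemlite_cache(print_error=True)  # False (and an error log) if gemlite is unavailable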
@@ -74,9 +90,7 @@ def apply_torchao_config_to_model(
         )
 
         # try to load gemlite kernel config
-        GemLiteLinearTriton.load_config(
-            f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
-        )
+        GemLiteLinearTriton.load_config(get_gemlite_cache_path())
 
     elif "fp8wo" in torchao_config:
         # this requires newer hardware
@@ -426,8 +426,7 @@ class UpdateWeightsFromDistributedReqOutput:
 
 @dataclass
 class UpdateWeightsFromTensorReqInput:
-    name: str
-    tensor: torch.Tensor
+    serialized_named_tensors: bytes  # indeed Dict[str, torch.Tensor]
 
 
 @dataclass
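UpdateWeightsFromTensorReqInput now carries one opaque bytes payload rather than a single named tensor. The serialization format is not shown in this diff; the sketch below uses torch.save on a name-to-tensor dict purely as an assumption to illustrate the shape of the data, not as sglang's actual codec:

import io
from typing import Dict

import torch

def pack_named_tensors(named: Dict[str, torch.Tensor]) -> bytes:
    # Assumption: any bytes round-trip matches the dataclass field; the real
    # producer/consumer in sglang may use a different serializer.
    buf = io.BytesIO()
    torch.save(named, buf)
    return buf.getvalue()

def unpack_named_tensors(blob: bytes) -> Dict[str, torch.Tensor]:
    return torch.load(io.BytesIO(blob))

payload = pack_named_tensors({"lm_head.weight": torch.zeros(4, 4)})
assert unpack_named_tensors(payload)["lm_head.weight"].shape == (4, 4)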