sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
@@ -17,6 +17,7 @@ import base64
 import builtins
 import ctypes
 import dataclasses
+import functools
 import importlib
 import io
 import ipaddress
@@ -25,6 +26,7 @@ import json
 import logging
 import os
 import pickle
+import platform
 import random
 import re
 import resource
@@ -44,6 +46,7 @@ from functools import lru_cache
 from importlib.metadata import PackageNotFoundError, version
 from importlib.util import find_spec
 from io import BytesIO
+from json import JSONDecodeError
 from multiprocessing.reduction import ForkingPickler
 from pathlib import Path
 from typing import (
@@ -157,6 +160,15 @@ def is_npu() -> bool:
     return hasattr(torch, "npu") and torch.npu.is_available()


+def is_cpu() -> bool:
+    machine = platform.machine().lower()
+    return (
+        machine in ("x86_64", "amd64", "i386", "i686")
+        and hasattr(torch, "cpu")
+        and torch.cpu.is_available()
+    )
+
+
 def is_flashinfer_available():
     """
     Check whether flashinfer is available.
@@ -826,6 +838,7 @@ class CustomCacheManager(FileCacheManager):


 def set_ulimit(target_soft_limit=65535):
+    # number of open files
     resource_type = resource.RLIMIT_NOFILE
     current_soft, current_hard = resource.getrlimit(resource_type)

@@ -835,6 +848,18 @@ def set_ulimit(target_soft_limit=65535):
     except ValueError as e:
         logger.warning(f"Fail to set RLIMIT_NOFILE: {e}")

+    # stack size
+    resource_type = resource.RLIMIT_STACK
+    current_soft, current_hard = resource.getrlimit(resource_type)
+    target_soft_limit_stack_size = 1024 * target_soft_limit
+    if current_soft < target_soft_limit_stack_size:
+        try:
+            resource.setrlimit(
+                resource_type, (target_soft_limit_stack_size, current_hard)
+            )
+        except ValueError as e:
+            logger.warning(f"Fail to set RLIMIT_STACK: {e}")
+

 def add_api_key_middleware(app, api_key: str):
     @app.middleware("http")
@@ -1362,6 +1387,11 @@ def print_warning_once(msg: str) -> None:
     logger.warning(msg, stacklevel=2)


+@functools.lru_cache(None)
+def print_info_once(msg: str) -> None:
+    logger.info(msg)
+
+
 def get_device_name(device_id: int = 0) -> str:
     if hasattr(torch, "cuda") and torch.cuda.is_available():
         return torch.cuda.get_device_name(device_id)
@@ -1917,16 +1947,18 @@ def next_power_of_2(n: int):
 setattr(triton, "next_power_of_2", next_power_of_2)


-
-def
-
-
-
-    finally:
-        # Cleanup code goes here
+class EmptyContextManager:
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
         pass


+def empty_context(*args, **kwargs):
+    return EmptyContextManager()
+
+
 def add_prefix(name: str, prefix: str) -> str:
     """Add a weight path prefix to a module name.

@@ -2025,6 +2057,14 @@ class DeepEPMode(Enum):
         return DeepEPMode.normal


+def is_non_idle_and_non_empty(forward_mode, hidden_states):
+    return (
+        (forward_mode is not None)
+        and not forward_mode.is_idle()
+        and hidden_states.shape[0] > 0
+    )
+
+
 def fast_topk(values, topk, dim):
     if topk == 1:
         # Use max along the specified dimension to get both value and index
@@ -2046,6 +2086,12 @@ is_ampere_with_cuda_12_3 = lambda: _check(8)
 is_hopper_with_cuda_12_3 = lambda: _check(9)


+def is_blackwell():
+    if not is_cuda():
+        return False
+    return torch.cuda.get_device_capability()[0] == 10
+
+
 def get_free_port():
     # try ipv4
     try:
@@ -2068,6 +2114,14 @@ def get_local_ip_by_remote() -> str:
     except Exception:
         pass

+    try:
+        hostname = socket.gethostname()
+        ip = socket.gethostbyname(hostname)
+        if ip and ip != "127.0.0.1" and ip != "0.0.0.0":
+            return ip
+    except Exception:
+        pass
+
     # try ipv6
     try:
         s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
@@ -2160,3 +2214,129 @@ class Withable(Generic[T]):
         finally:
             assert self._value is new_value
             self._value = None
+
+
+def merge_bias_tensor(
+    lhs: Optional[torch.Tensor],
+    rhs: Optional[torch.Tensor],
+    bs1: int,
+    bs2: int,
+    device: str,
+    default: float,
+):
+    """Merge two bias tensors for batch merging.
+
+    Args:
+        lhs: Left-hand side tensor
+        rhs: Right-hand side tensor
+        bs1: Batch size of left-hand side tensor
+        bs2: Batch size of right-hand side tensor
+        device: Device to place the merged tensor on
+        default: Default value for missing tensor elements
+
+    Returns:
+        Merged tensor or None if both inputs are None
+    """
+    if lhs is None and rhs is None:
+        return None
+
+    if lhs is not None and rhs is not None:
+        return torch.cat([lhs, rhs])
+    else:
+        if lhs is not None:
+            shape, dtype = lhs.shape[1:], lhs.dtype
+        else:
+            shape, dtype = rhs.shape[1:], rhs.dtype
+
+        if lhs is None:
+            lhs = torch.empty((bs1, *shape), device=device, dtype=dtype).fill_(default)
+        if rhs is None:
+            rhs = torch.empty((bs2, *shape), device=device, dtype=dtype).fill_(default)
+        return torch.cat([lhs, rhs])
+
+
+def find_local_repo_dir(repo_id: str, revision: Optional[str] = None) -> Optional[str]:
+    import huggingface_hub as hf
+
+    # Build cache path
+    cache_path = os.path.join(
+        hf.constants.HF_HUB_CACHE,
+        hf.constants.REPO_ID_SEPARATOR.join(["models", *repo_id.split("/")]),
+    )
+
+    # Get revision from main ref if not specified
+    if not revision:
+        ref_path = os.path.join(cache_path, "refs", "main")
+        if os.path.isfile(ref_path):
+            with open(ref_path) as f:
+                revision = f.read().strip()
+
+    # List files from revision directory
+    if revision:
+        rev_dir = os.path.join(cache_path, "snapshots", revision)
+        if os.path.isdir(rev_dir):
+            return rev_dir
+
+    return None
+
+
+def read_system_prompt_from_file(model_name: str) -> str:
+    """Read system prompt from a file in the HuggingFace cache directory.
+
+    Args:
+        model_name: The model name to construct the file path
+
+    Returns:
+        The system prompt content from the file, or empty string if file not found
+    """
+    try:
+        local_repo_dir = find_local_repo_dir(model_name)
+        if local_repo_dir:
+            system_prompt_file = os.path.join(local_repo_dir, "SYSTEM_PROMPT.txt")
+            if os.path.exists(system_prompt_file):
+                with open(system_prompt_file, "r", encoding="utf-8") as f:
+                    return f.read()
+
+        return ""
+    except Exception:
+        # If anything fails, return empty string
+        return ""
+
+
+def bind_or_assign(target, source):
+    if target is not None:
+        target.copy_(source)
+        return target
+    else:
+        return source
+
+
+def support_triton(backend: str) -> bool:
+    return backend not in ["torch_native", "intel_amx"]
+
+
+try:
+    import sgl_kernel
+
+    is_intel_amx_backend_available = hasattr(
+        torch.ops.sgl_kernel, "convert_weight_packed"
+    )
+except:
+    is_intel_amx_backend_available = False
+
+
+def cpu_has_amx_support():
+    return torch._C._cpu._is_amx_tile_supported() and is_intel_amx_backend_available
+
+
+class LazyValue:
+    def __init__(self, creator: Callable):
+        self._creator = creator
+        self._value = None
+
+    @property
+    def value(self):
+        if self._creator is not None:
+            self._value = self._creator()
+            self._creator = None
+        return self._value
sglang/test/attention/test_prefix_chunk_info.py
CHANGED
@@ -2,6 +2,8 @@ import unittest

 import torch

+from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend
+from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.test.test_utils import CustomTestCase
sglang/test/runners.py
CHANGED
@@ -26,6 +26,7 @@ from transformers import (
     AutoModelForCausalLM,
     AutoModelForVision2Seq,
     AutoProcessor,
+    GenerationConfig,
 )

 from sglang.srt.entrypoints.engine import Engine
@@ -41,6 +42,21 @@ DEFAULT_PROMPTS = [
     # the output of gemma-2-2b from SRT is unstable on the commented prompt
     # "The capital of France is",
 ]
+TEST_RERANK_QUERY_DOCS = [
+    {
+        "query": "How many people live in Berlin?",
+        "documents": [
+            "Berlin is well known for its museums.",
+        ],
+    },
+    {
+        "query": "How many people live in Berlin?",
+        "documents": [
+            "Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.",
+            "Berlin is well known for its museums.",
+        ],
+    },
+]

 dirpath = os.path.dirname(__file__)
 with open(os.path.join(dirpath, "long_prompt.txt"), "r") as f:
@@ -240,7 +256,7 @@ class HFRunner:
             self.model = _get_sentence_transformer_embedding_model(
                 model_path, torch_dtype
             )
-        elif self.model_type == "reward":
+        elif self.model_type == "reward" or self.model_type == "cross_encoder":
             from transformers import AutoModelForSequenceClassification

             self.model = AutoModelForSequenceClassification.from_pretrained(
@@ -302,6 +318,15 @@ class HFRunner:
                 else:
                     logits = self.model.encode(prompts).tolist()
                 out_queue.put(ModelOutput(embed_logits=logits))
+            elif self.model_type == "cross_encoder":
+                inputs = self.tokenizer(
+                    prompts, padding=True, return_tensors="pt"
+                ).to("cuda")
+                scores = self.model(**inputs).logits
+                scores = scores.squeeze().tolist()
+                if not isinstance(scores, list):
+                    scores = [scores]
+                out_queue.put(ModelOutput(scores=scores))

             elif self.model_type == "reward":
                 scores = []
@@ -321,7 +346,9 @@ class HFRunner:

     def forward(
         self,
-        prompts: Union[
+        prompts: Union[
+            List[List[str]], List[str], List[torch.Tensor]
+        ] = DEFAULT_PROMPTS,
         image_data: Optional[List[str]] = None,
         max_new_tokens: int = 8,
         lora_paths: Optional[List[str]] = None,
@@ -382,13 +409,17 @@ class HFRunner:
                 model = base_model

             outputs = model.generate(
-                input_ids,
-
-
-
-
-
-
+                input_ids=input_ids,
+                generation_config=GenerationConfig(
+                    do_sample=False,
+                    temperature=None,
+                    top_p=None,
+                    max_new_tokens=max_new_tokens,
+                    return_dict_in_generate=True,
+                    output_scores=(not output_str_only),
+                    # make sure to disable compile
+                    disable_compile=True,
+                ),
             )

             text = tokenizer.decode(
@@ -450,6 +481,7 @@ class SRTRunner:
         torch_dtype: torch.dtype,
         model_type: str,
         tp_size: int = 1,
+        impl: str = "auto",
         port: int = DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
         lora_paths: List[str] = None,
         max_loras_per_batch: int = 4,
@@ -470,6 +502,7 @@ class SRTRunner:
         speculative_num_draft_tokens: Optional[int] = None,
         disable_overlap_schedule: bool = False,
         disable_custom_all_reduce: bool = False,
+        torchao_config: Optional[str] = None,
     ):
         self.model_type = model_type
         self.is_generation = model_type == "generation"
@@ -488,6 +521,8 @@ class SRTRunner:
             tp_size=tp_size,
             dtype=get_dtype_str(torch_dtype),
             port=port,
+            impl=impl,
+            torchao_config=torchao_config,
             mem_fraction_static=mem_fraction_static,
             trust_remote_code=trust_remote_code,
             is_embedding=not self.is_generation,
@@ -517,7 +552,9 @@ class SRTRunner:

     def forward(
         self,
-        prompts: Union[
+        prompts: Union[
+            List[List[str]], List[str], List[torch.Tensor]
+        ] = DEFAULT_PROMPTS,
         image_data: Optional[List[str]] = None,
         max_new_tokens: int = 8,
         lora_paths: Optional[List[str]] = None,
@@ -543,6 +580,13 @@ class SRTRunner:
             else:
                 logits = [response["embedding"]]
             return ModelOutput(embed_logits=logits)
+        # cross encoder model
+        elif self.model_type == "cross_encoder":
+            response = self.engine.rerank(prompts)
+            if not isinstance(response, list):
+                response = [response]
+            scores = [x["embedding"] for x in response]
+            return ModelOutput(scores=scores)
         # reward model
         else:
             response = self.engine.encode(prompts)
sglang/test/send_one.py
CHANGED
sglang/test/test_block_fp8.py
CHANGED
sglang/test/test_block_fp8_deep_gemm_blackwell.py
ADDED
@@ -0,0 +1,252 @@
+import itertools
+import os
+import unittest
+from typing import List, Tuple
+
+import torch
+from deep_gemm import fp8_gemm_nt
+
+from sglang.test.test_utils import CustomTestCase
+
+_is_cuda = torch.cuda.is_available() and torch.version.cuda
+
+
+# Modify form DeepGEMM Blackwell
+def ceil_div(x: int, y: int) -> int:
+    return (x + y - 1) // y
+
+
+def align(x: int, y: int) -> int:
+    return ceil_div(x, y) * y
+
+
+def per_token_group_quant_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    sf = x_amax / 448.0
+    return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf
+
+
+def per_block_quant_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = x_amax / 448.0
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+def ceil_to_ue8m0(x: torch.Tensor):
+    assert x.view(-1).amax().item() > 0
+    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
+
+
+def per_token_group_quant_mxfp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    sf = ceil_to_ue8m0(x_amax / 448.0)
+    return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf
+
+
+def per_block_quant_mxfp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = ceil_to_ue8m0(x_amax / 448.0)
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+# For test
+def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
+    """This function performs matrix multiplication with block-wise quantization using native torch.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+    """
+
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+    assert A.shape[-1] == B.shape[-1]
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+    assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+
+    M = A.numel() // A.shape[-1]
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (N,)
+    A = A.reshape(M, A.shape[-1])
+    As = As.reshape(M, As.shape[-1])
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+    assert n_tiles == Bs.shape[0]
+    assert k_tiles == Bs.shape[1]
+
+    C_shape = (M, N)
+    C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+    A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)]
+    B_tiles = [
+        [
+            B[
+                j * block_n : min((j + 1) * block_n, N),
+                i * block_k : min((i + 1) * block_k, K),
+            ]
+            for i in range(k_tiles)
+        ]
+        for j in range(n_tiles)
+    ]
+    C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)]
+    As_tiles = [As[:, i : i + 1] for i in range(k_tiles)]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            a = A_tiles[i]
+            b = B_tiles[j][i]
+            c = C_tiles[j]
+            s = As_tiles[i] * Bs[j][i]
+            c[:, :] += torch.matmul(a, b.t()) * s
+
+    C = C.reshape(origin_C_shape).to(output_dtype)
+    return C
+
+
+def block_quant_dequant(
+    x_q_block: torch.Tensor,
|
136
|
+
x_s: torch.Tensor,
|
137
|
+
block_size: List[int],
|
138
|
+
dtype: torch.dtype,
|
139
|
+
) -> torch.Tensor:
|
140
|
+
"""This function converts block-wise quantization to unquantized.
|
141
|
+
The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
|
142
|
+
and the block size.
|
143
|
+
The output is an unquantized tensor with dtype.
|
144
|
+
"""
|
145
|
+
block_n, block_k = block_size[0], block_size[1]
|
146
|
+
n, k = x_q_block.shape
|
147
|
+
n_tiles = (n + block_n - 1) // block_n
|
148
|
+
k_tiles = (k + block_k - 1) // block_k
|
149
|
+
assert n_tiles == x_s.shape[0]
|
150
|
+
assert k_tiles == x_s.shape[1]
|
151
|
+
|
152
|
+
x_dq_block = torch.empty_like(x_q_block, dtype=dtype)
|
153
|
+
|
154
|
+
for j in range(n_tiles):
|
155
|
+
for i in range(k_tiles):
|
156
|
+
x_q_block_tile = x_q_block[
|
157
|
+
j * block_n : min((j + 1) * block_n, n),
|
158
|
+
i * block_k : min((i + 1) * block_k, k),
|
159
|
+
]
|
160
|
+
x_dq_block_tile = x_dq_block[
|
161
|
+
j * block_n : min((j + 1) * block_n, n),
|
162
|
+
i * block_k : min((i + 1) * block_k, k),
|
163
|
+
]
|
164
|
+
x_dq_block_tile[:, :] = x_q_block_tile.to(torch.float32) * x_s[j][i]
|
165
|
+
|
166
|
+
return x_dq_block
|
167
|
+
|
168
|
+
|
169
|
+
class TestDeepGemmBlackwell(CustomTestCase):
|
170
|
+
|
171
|
+
if not _is_cuda:
|
172
|
+
OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
|
173
|
+
M = [1, 7, 83, 512, 2048]
|
174
|
+
NKs = [
|
175
|
+
(N, K)
|
176
|
+
for N in [128, 512, 1024, 4096, 7748, 13824]
|
177
|
+
for K in [256, 4096, 5120, 3884, 13824]
|
178
|
+
]
|
179
|
+
# BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
|
180
|
+
BLOCK_SIZE = [[128, 128]]
|
181
|
+
SEEDS = [0]
|
182
|
+
else:
|
183
|
+
# use practical shape in DeepSeek V3 for test
|
184
|
+
OUT_DTYPES = [torch.bfloat16]
|
185
|
+
M = [64, 128, 512, 1024, 4096]
|
186
|
+
NKs = [
|
187
|
+
(2112, 7168),
|
188
|
+
(1536, 7168),
|
189
|
+
# (3072, 1536),
|
190
|
+
# (24576, 7168),
|
191
|
+
# (4096, 512),
|
192
|
+
# (7168, 2048),
|
193
|
+
# (4608, 7168),
|
194
|
+
# (512, 7168),
|
195
|
+
# (7168, 2304),
|
196
|
+
# (7168, 512),
|
197
|
+
]
|
198
|
+
BLOCK_SIZE = [[128, 128]]
|
199
|
+
SEEDS = [0]
|
200
|
+
|
201
|
+
@classmethod
|
202
|
+
def setUpClass(cls):
|
203
|
+
if not torch.cuda.is_available():
|
204
|
+
raise unittest.SkipTest("CUDA is not available")
|
205
|
+
torch.set_default_device("cuda")
|
206
|
+
|
207
|
+
def _test_deep_gemm_blackwell(self, M, NK, block_size, out_dtype, seed):
|
208
|
+
N, K = NK
|
209
|
+
torch.manual_seed(seed)
|
210
|
+
|
211
|
+
A = torch.empty((M, K), dtype=torch.bfloat16).normal_(0, 0.2)
|
212
|
+
B = torch.empty((N, K), dtype=torch.bfloat16).normal_(0, 0.2)
|
213
|
+
|
214
|
+
A_q, A_s = per_token_group_quant_fp8(A)
|
215
|
+
B_q, B_s = per_block_quant_fp8(B)
|
216
|
+
|
217
|
+
A_dq = block_quant_dequant(A_q, A_s, [1, block_size[1]], out_dtype)
|
218
|
+
B_dq = block_quant_dequant(B_q, B_s, block_size, out_dtype)
|
219
|
+
|
220
|
+
A_qu = per_token_group_quant_mxfp8(A_dq)
|
221
|
+
B_qu = per_block_quant_mxfp8(B_dq)
|
222
|
+
out = None
|
223
|
+
|
224
|
+
with torch.inference_mode():
|
225
|
+
ref_out = native_w8a8_block_fp8_matmul(
|
226
|
+
A_q, B_q, A_s, B_s, block_size, out_dtype
|
227
|
+
)
|
228
|
+
out = torch.empty_like(ref_out)
|
229
|
+
fp8_gemm_nt(A_qu, B_qu, out)
|
230
|
+
|
231
|
+
torch.testing.assert_close(out, ref_out, atol=1e-1, rtol=1e-2)
|
232
|
+
|
233
|
+
def test_deep_gemm_blackwell(self):
|
234
|
+
for params in itertools.product(
|
235
|
+
self.M,
|
236
|
+
self.NKs,
|
237
|
+
self.BLOCK_SIZE,
|
238
|
+
self.OUT_DTYPES,
|
239
|
+
self.SEEDS,
|
240
|
+
):
|
241
|
+
with self.subTest(
|
242
|
+
M=params[0],
|
243
|
+
NKs=params[1],
|
244
|
+
block_size=params[2],
|
245
|
+
out_dtype=params[3],
|
246
|
+
seed=params[4],
|
247
|
+
):
|
248
|
+
self._test_deep_gemm_blackwell(*params)
|
249
|
+
|
250
|
+
|
251
|
+
if __name__ == "__main__":
|
252
|
+
unittest.main(verbosity=2)
|
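As a quick orientation to the scaling layout used in this new test: per_token_group_quant_fp8 produces one scale per 1x128 group of each row (scales of shape (M, K/128)), while per_block_quant_fp8 produces one scale per 128x128 tile (scales of shape (ceil(N/128), ceil(K/128))). The round-trip sanity check below is a sketch only; it assumes these helpers are importable from sglang.test.test_block_fp8 as added in this diff and that the local PyTorch build supports torch.float8_e4m3fn (no GPU is needed, since only casts and elementwise ops are involved).

import torch

from sglang.test.test_block_fp8 import (
    block_quant_dequant,
    per_block_quant_fp8,
    per_token_group_quant_fp8,
)

x = torch.randn(256, 512, dtype=torch.bfloat16) * 0.2

# Activation-style quantization: one scale per (row, 128-wide group) -> scales (256, 4).
x_q, x_s = per_token_group_quant_fp8(x)
x_dq = block_quant_dequant(x_q, x_s, [1, 128], torch.bfloat16)
print("per-token-group max abs error:", (x_dq - x).abs().max().item())

# Weight-style quantization: one scale per 128x128 tile -> scales (2, 4).
w_q, w_s = per_block_quant_fp8(x)
w_dq = block_quant_dequant(w_q, w_s, [128, 128], torch.bfloat16)
print("per-block max abs error:", (w_dq - x).abs().max().item())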