sglang 0.4.9__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +2 -2
- sglang/srt/configs/model_config.py +36 -2
- sglang/srt/conversation.py +56 -3
- sglang/srt/disaggregation/ascend/__init__.py +6 -0
- sglang/srt/disaggregation/ascend/conn.py +44 -0
- sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
- sglang/srt/disaggregation/mooncake/conn.py +50 -18
- sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
- sglang/srt/disaggregation/utils.py +25 -3
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/entrypoints/http_server.py +1 -0
- sglang/srt/entrypoints/http_server_engine.py +1 -1
- sglang/srt/entrypoints/openai/protocol.py +11 -0
- sglang/srt/entrypoints/openai/serving_chat.py +7 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/kimik2_detector.py +220 -0
- sglang/srt/hf_transformers_utils.py +18 -0
- sglang/srt/jinja_template_utils.py +8 -0
- sglang/srt/layers/communicator.py +20 -5
- sglang/srt/layers/flashinfer_comm_fusion.py +3 -3
- sglang/srt/layers/layernorm.py +2 -2
- sglang/srt/layers/linear.py +12 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -1
- sglang/srt/layers/moe/ep_moe/layer.py +141 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +141 -59
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
- sglang/srt/layers/moe/topk.py +8 -2
- sglang/srt/layers/parameter.py +19 -3
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/fp8.py +28 -7
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +244 -1
- sglang/srt/layers/quantization/moe_wna16.py +1 -2
- sglang/srt/layers/quantization/w4afp8.py +264 -0
- sglang/srt/layers/quantization/w8a8_int8.py +738 -14
- sglang/srt/layers/vocab_parallel_embedding.py +9 -3
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
- sglang/srt/managers/cache_controller.py +41 -195
- sglang/srt/managers/io_struct.py +35 -3
- sglang/srt/managers/mm_utils.py +59 -96
- sglang/srt/managers/schedule_batch.py +17 -6
- sglang/srt/managers/scheduler.py +38 -6
- sglang/srt/managers/tokenizer_manager.py +16 -0
- sglang/srt/mem_cache/hiradix_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +176 -101
- sglang/srt/mem_cache/memory_pool_host.py +6 -109
- sglang/srt/mem_cache/radix_cache.py +8 -4
- sglang/srt/model_executor/forward_batch_info.py +13 -1
- sglang/srt/model_loader/loader.py +23 -12
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +78 -19
- sglang/srt/models/deepseek_vl2.py +1 -1
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +6 -3
- sglang/srt/models/internvl.py +8 -2
- sglang/srt/models/kimi_vl.py +8 -2
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llava.py +3 -1
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpmo.py +1 -2
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral_quant.py +4 -0
- sglang/srt/models/mllama4.py +372 -82
- sglang/srt/models/phi4mm.py +8 -2
- sglang/srt/models/phimoe.py +553 -0
- sglang/srt/models/qwen2.py +2 -0
- sglang/srt/models/qwen2_5_vl.py +10 -7
- sglang/srt/models/qwen2_vl.py +12 -1
- sglang/srt/models/vila.py +8 -2
- sglang/srt/multimodal/mm_utils.py +2 -2
- sglang/srt/multimodal/processors/base_processor.py +197 -137
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
- sglang/srt/multimodal/processors/gemma3.py +4 -2
- sglang/srt/multimodal/processors/gemma3n.py +1 -1
- sglang/srt/multimodal/processors/internvl.py +1 -1
- sglang/srt/multimodal/processors/janus_pro.py +1 -1
- sglang/srt/multimodal/processors/kimi_vl.py +1 -1
- sglang/srt/multimodal/processors/minicpm.py +4 -3
- sglang/srt/multimodal/processors/mllama4.py +63 -61
- sglang/srt/multimodal/processors/phi4mm.py +1 -1
- sglang/srt/multimodal/processors/pixtral.py +1 -1
- sglang/srt/multimodal/processors/qwen_vl.py +203 -80
- sglang/srt/multimodal/processors/vila.py +1 -1
- sglang/srt/server_args.py +26 -4
- sglang/srt/two_batch_overlap.py +3 -0
- sglang/srt/utils.py +191 -48
- sglang/test/test_cutlass_w4a8_moe.py +281 -0
- sglang/utils.py +5 -5
- sglang/version.py +1 -1
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/METADATA +6 -4
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/RECORD +99 -90
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
```diff
@@ -15,7 +15,6 @@

 from __future__ import annotations

-import base64
 import builtins
 import ctypes
 import dataclasses
@@ -68,6 +67,7 @@ from typing import (

 import numpy as np
 import psutil
+import pybase64
 import requests
 import torch
 import torch.distributed
@@ -83,12 +83,7 @@ from torch.func import functional_call
 from torch.library import Library
 from torch.profiler import ProfilerActivity, profile, record_function
 from torch.utils._contextlib import _DecoratorContextManager
-from triton.runtime.cache import (
-    FileCacheManager,
-    default_cache_dir,
-    default_dump_dir,
-    default_override_dir,
-)
+from triton.runtime.cache import FileCacheManager

 logger = logging.getLogger(__name__)

@@ -202,7 +197,7 @@ def get_int_env_var(name: str, default: int = 0) -> int:


 def support_triton(backend: str) -> bool:
-    return backend not in ["torch_native", "intel_amx"]
+    return backend not in ["torch_native", "intel_amx", "ascend"]


 try:
```
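The `support_triton` change adds the Ascend backend to the list of backends that bypass Triton. A minimal sketch of the observable behavior, assuming the import path above:

```python
# Sketch: backends not in the exclusion list keep using Triton kernels.
from sglang.srt.utils import support_triton

assert support_triton("flashinfer")
assert not support_triton("intel_amx")
assert not support_triton("ascend")  # newly excluded in 0.4.9.post2
```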
```diff
@@ -621,7 +616,7 @@ def decode_video_base64(video_base64):
     from PIL import Image

     # Decode the base64 string
-    video_bytes = base64.b64decode(video_base64)
+    video_bytes = pybase64.b64decode(video_base64, validate=True)

     # Placeholder for the start indices of each PNG image
     img_starts = []
@@ -707,7 +702,9 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarray
         audio, original_sr = sf.read(BytesIO(audio_file))
     elif audio_file.startswith("data:"):
         audio_file = audio_file.split(",")[1]
-        audio, original_sr = sf.read(BytesIO(base64.b64decode(audio_file)))
+        audio, original_sr = sf.read(
+            BytesIO(pybase64.b64decode(audio_file, validate=True))
+        )
     elif audio_file.startswith("http://") or audio_file.startswith("https://"):
         timeout = int(os.getenv("REQUEST_TIMEOUT", "5"))
         response = requests.get(audio_file, stream=True, timeout=timeout)
```
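These hunks are part of a release-wide swap from the stdlib `base64` module to `pybase64`, a faster SIMD implementation with the same API. The sketch below (illustrative values) shows the drop-in compatibility and the effect of `validate=True`, which rejects non-alphabet characters instead of silently discarding them:

```python
import base64
import pybase64

payload = b"example payload"
encoded = pybase64.b64encode(payload).decode("utf-8")
# Same output as the stdlib encoder.
assert encoded == base64.b64encode(payload).decode("utf-8")
assert pybase64.b64decode(encoded, validate=True) == payload

try:
    pybase64.b64decode("not base64!!", validate=True)
except Exception as exc:  # raises binascii.Error on invalid input
    print(f"rejected: {exc}")
```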
```diff
@@ -731,33 +728,6 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarray
     return audio


-def encode_video(video_path, frame_count_limit=None):
-    # Lazy import because decord is not available on some arm platforms.
-    from decord import VideoReader, cpu
-
-    if not os.path.exists(video_path):
-        logger.error(f"Video {video_path} does not exist")
-        return []
-
-    if frame_count_limit == 0:
-        return []
-
-    def uniform_sample(l, n):
-        gap = len(l) / n
-        idxs = [int(i * gap + gap / 2) for i in range(n)]
-        return [l[i] for i in idxs]
-
-    vr = VideoReader(video_path, ctx=cpu(0))
-    sample_fps = round(vr.get_avg_fps() / 1)  # FPS
-    frame_indices = [i for i in range(0, len(vr), sample_fps)]
-    if frame_count_limit is not None and len(frame_indices) > frame_count_limit:
-        frame_indices = uniform_sample(frame_indices, frame_count_limit)
-
-    frames = vr.get_batch(frame_indices).asnumpy()
-    frames = [Image.fromarray(v.astype("uint8")) for v in frames]
-    return frames
-
-
 def load_image(
     image_file: Union[Image.Image, str, bytes],
 ) -> tuple[Image.Image, tuple[int, int]]:
@@ -776,18 +746,70 @@ def load_image(
         image = Image.open(image_file)
     elif image_file.startswith("data:"):
         image_file = image_file.split(",")[1]
-        image = Image.open(BytesIO(base64.b64decode(image_file)))
-    elif image_file.startswith("video:"):
-        image_file = image_file.replace("video:", "")
-        image, image_size = decode_video_base64(image_file)
+        image = Image.open(BytesIO(pybase64.b64decode(image_file, validate=True)))
     elif isinstance(image_file, str):
-        image = Image.open(BytesIO(base64.b64decode(image_file)))
+        image = Image.open(BytesIO(pybase64.b64decode(image_file, validate=True)))
     else:
         raise ValueError(f"Invalid image: {image}")

     return image, image_size


+def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
+    # We import decord here to avoid a strange Segmentation fault (core dumped) issue.
+    from decord import VideoReader, cpu, gpu
+
+    try:
+        from decord.bridge import decord_bridge
+
+        ctx = gpu(0)
+        _ = decord_bridge.get_ctx_device(ctx)
+    except Exception:
+        ctx = cpu(0)
+
+    tmp_file = None
+    vr = None
+    try:
+        if isinstance(video_file, bytes):
+            tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+            tmp_file.write(video_file)
+            tmp_file.close()
+            vr = VideoReader(tmp_file.name, ctx=ctx)
+        elif isinstance(video_file, str):
+            if video_file.startswith(("http://", "https://")):
+                timeout = int(os.getenv("REQUEST_TIMEOUT", "10"))
+                response = requests.get(video_file, stream=True, timeout=timeout)
+                response.raise_for_status()
+                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+                for chunk in response.iter_content(chunk_size=8192):
+                    tmp_file.write(chunk)
+                tmp_file.close()
+                vr = VideoReader(tmp_file.name, ctx=ctx)
+            elif video_file.startswith("data:"):
+                _, encoded = video_file.split(",", 1)
+                video_bytes = base64.b64decode(encoded)
+                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+                tmp_file.write(video_bytes)
+                tmp_file.close()
+                vr = VideoReader(tmp_file.name, ctx=ctx)
+            elif os.path.isfile(video_file):
+                vr = VideoReader(video_file, ctx=ctx)
+            else:
+                video_bytes = base64.b64decode(video_file)
+                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+                tmp_file.write(video_bytes)
+                tmp_file.close()
+                vr = VideoReader(tmp_file.name, ctx=ctx)
+        else:
+            raise ValueError(f"Unsupported video input type: {type(video_file)}")
+
+        return vr
+
+    finally:
+        if tmp_file and os.path.exists(tmp_file.name):
+            os.unlink(tmp_file.name)
+
+
 def suppress_other_loggers():
     warnings.filterwarnings(
         "ignore", category=UserWarning, message="The given NumPy array is not writable"
```
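`encode_video` (which eagerly decoded frames to PIL images) is replaced by `load_video`, which returns a decord `VideoReader` and leaves frame sampling to the caller. A hypothetical usage sketch — the file path is a placeholder, and frame access follows decord's API:

```python
from sglang.srt.utils import load_video

vr = load_video("clip.mp4")  # also accepts raw bytes, URLs, and data:/base64 strings
fps = vr.get_avg_fps()
# Sample roughly one frame per second.
indices = list(range(0, len(vr), max(int(round(fps)), 1)))
frames = vr.get_batch(indices).asnumpy()  # (num_frames, H, W, 3) uint8 array
```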
```diff
@@ -923,18 +945,41 @@ class CustomCacheManager(FileCacheManager):

         self.key = key
         self.lock_path = None
+
+        try:
+            module_path = "triton.runtime.cache"
+            cache_module = importlib.import_module(module_path)
+
+            default_cache_dir = getattr(cache_module, "default_cache_dir", None)
+            default_dump_dir = getattr(cache_module, "default_dump_dir", None)
+            default_override_dir = getattr(cache_module, "default_override_dir", None)
+        except (ModuleNotFoundError, AttributeError) as e:
+            default_cache_dir = None
+            default_dump_dir = None
+            default_override_dir = None
+
         if dump:
-            self.cache_dir = default_dump_dir()
+            self.cache_dir = (
+                default_dump_dir()
+                if default_dump_dir is not None
+                else os.path.join(Path.home(), ".triton", "dump")
+            )
             self.cache_dir = os.path.join(self.cache_dir, self.key)
             self.lock_path = os.path.join(self.cache_dir, "lock")
             os.makedirs(self.cache_dir, exist_ok=True)
         elif override:
-            self.cache_dir = default_override_dir()
+            self.cache_dir = (
+                default_override_dir()
+                if default_override_dir is not None
+                else os.path.join(Path.home(), ".triton", "override")
+            )
             self.cache_dir = os.path.join(self.cache_dir, self.key)
         else:
             # create cache directory if it doesn't exist
-            self.cache_dir = (
-                os.getenv("TRITON_CACHE_DIR", "").strip() or default_cache_dir()
+            self.cache_dir = os.getenv("TRITON_CACHE_DIR", "").strip() or (
+                default_cache_dir()
+                if default_cache_dir is not None
+                else os.path.join(Path.home(), ".triton", "cache")
             )
             if self.cache_dir:
                 try:
```
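Newer Triton releases removed the module-level `default_cache_dir`/`default_dump_dir`/`default_override_dir` helpers, so `CustomCacheManager` now resolves them via `getattr` at runtime and falls back to the conventional `~/.triton/...` locations. A standalone sketch of the same fallback pattern (the function name here is illustrative):

```python
import importlib
import os
from pathlib import Path


def resolve_triton_cache_dir() -> str:
    """Prefer TRITON_CACHE_DIR, then Triton's helper if it still exists,
    then ~/.triton/cache."""
    try:
        cache_module = importlib.import_module("triton.runtime.cache")
        default_cache_dir = getattr(cache_module, "default_cache_dir", None)
    except ModuleNotFoundError:
        default_cache_dir = None
    return os.getenv("TRITON_CACHE_DIR", "").strip() or (
        default_cache_dir()
        if default_cache_dir is not None
        else os.path.join(Path.home(), ".triton", "cache")
    )
```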
```diff
@@ -1848,7 +1893,7 @@ class MultiprocessingSerializer:

         if output_str:
             # Convert bytes to base64-encoded string
-            output = base64.b64encode(output).decode("utf-8")
+            output = pybase64.b64encode(output).decode("utf-8")

         return output

@@ -1865,7 +1910,7 @@ class MultiprocessingSerializer:
         """
         if isinstance(data, str):
             # Decode base64 string to bytes
-            data = base64.b64decode(data)
+            data = pybase64.b64decode(data, validate=True)

         return ForkingPickler.loads(data)

```
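Both directions of `MultiprocessingSerializer` now go through `pybase64`, with strict validation on decode. A hypothetical round trip, assuming the `output_str=True` keyword shown in the surrounding code:

```python
from sglang.srt.utils import MultiprocessingSerializer

payload = {"rank": 0, "ok": True}
wire = MultiprocessingSerializer.serialize(payload, output_str=True)
assert isinstance(wire, str)  # base64 text, safe to embed in JSON/ZMQ messages
assert MultiprocessingSerializer.deserialize(wire) == payload
```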
```diff
@@ -2737,3 +2782,101 @@ def lru_cache_frozenset(maxsize=128):
         return wrapper

     return decorator
+
+
+def apply_module_patch(target_module, target_function, wrappers):
+    original_module, original_function = parse_module_path(
+        target_module, target_function, False
+    )
+
+    original_function_id = id(original_function)
+
+    candidate = original_function
+    for wrapper in wrappers:
+        candidate = wrapper(candidate)
+    if target_function is not None:
+        setattr(original_module, target_function, candidate)
+
+    for key, value in sys.modules.copy().items():
+        if (
+            target_function is not None
+            and hasattr(value, target_function)
+            and id(getattr(value, target_function)) == original_function_id
+        ):
+            setattr(value, target_function, candidate)
+
+
+def parse_module_path(module_path, function_name, create_dummy):
+    from importlib.machinery import ModuleSpec
+
+    def create_dummy_module(full_path, parent=None):
+        """Create and register a placeholder module"""
+        dummy = types.ModuleType(full_path)
+        dummy.__file__ = "vllm_ascend.dummy_module.py"
+        dummy.__spec__ = ModuleSpec(full_path, None)
+        sys.modules[full_path] = dummy
+        if parent:
+            setattr(parent, full_path.split(".")[-1], dummy)
+        return dummy
+
+    def create_placeholder_function(func_name):
+        """Create dummy function that raises when called"""
+
+        def placeholder(*args, **kwargs):
+            raise NotImplementedError(f"Function {func_name} is a placeholder")
+
+        placeholder.__name__ = func_name
+        return placeholder
+
+    modules = module_path.split(".")
+    current_module = None
+    processed_path = []
+
+    for idx, part in enumerate(modules):
+        current_path = ".".join(modules[: idx + 1])
+        parent_path = ".".join(modules[:idx]) if idx > 0 else None
+
+        try:
+            current_module = importlib.import_module(current_path)
+        except ModuleNotFoundError:
+            # Handle missing module
+            parent = importlib.import_module(parent_path) if parent_path else None
+            if parent and hasattr(parent, part):
+                # Use existing attribute from parent
+                current_module = getattr(parent, part)
+                # Check for early function resolution
+                if function_name and hasattr(current_module, function_name):
+                    return current_module, getattr(current_module, function_name)
+                if function_name and create_dummy:
+                    ph_func = create_placeholder_function(function_name)
+                    setattr(current_module, function_name, ph_func)
+                    return current_module, ph_func
+                if function_name:
+                    raise AttributeError(
+                        f"Function {function_name} missing in {current_path}"
+                    )
+            else:
+                if not create_dummy:
+                    raise
+                # Create and register dummy module
+                current_module = create_dummy_module(
+                    current_path,
+                    parent=(
+                        importlib.import_module(parent_path) if parent_path else None
+                    ),
+                )
+
+        processed_path.append(part)
+
+    # Final function handling
+    final_module = sys.modules[module_path]
+    if function_name is not None:
+        if not hasattr(final_module, function_name):
+            if create_dummy:
+                ph_func = create_placeholder_function(function_name)
+                setattr(final_module, function_name, ph_func)
+            else:
+                setattr(final_module, function_name, None)
+        return final_module, getattr(final_module, function_name)
+
+    return final_module, None
```
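`apply_module_patch` resolves a function through `parse_module_path`, wraps it with the given decorators, and rebinds every module attribute in `sys.modules` that still points at the original object, so `from x import f`-style aliases get patched too. A hypothetical usage sketch:

```python
import functools

from sglang.srt.utils import apply_module_patch


def log_calls(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        print(f"calling {fn.__name__}")
        return fn(*args, **kwargs)

    return wrapper


apply_module_patch("json", "dumps", [log_calls])

import json

json.dumps({"patched": True})  # prints "calling dumps" before serializing
```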
sglang/test/test_cutlass_w4a8_moe.py
ADDED
```diff
@@ -0,0 +1,281 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import pytest
+import torch
+
+from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
+from sglang.srt.layers.moe.topk import select_experts
+
+
+def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Tensor:
+    if int4_values_interleaved.shape[-1] % 2 != 0:
+        raise ValueError(
+            "the last dim size of int4_values_interleaved tensor must be even."
+        )
+
+    input_tensor_int8 = int4_values_interleaved.to(torch.int8)
+
+    low_nibbles = input_tensor_int8[..., 0::2]
+    high_nibbles = input_tensor_int8[..., 1::2]
+
+    packed_tensor = (high_nibbles << 4) | (low_nibbles & 0x0F)
+
+    return packed_tensor.to(torch.int8)
+
+
+def pack_interleave(num_experts, ref_weight, ref_scale):
+    n, k = ref_weight.shape[1], ref_weight.shape[2]
+
+    weight = pack_int4_values_to_int8(ref_weight.cpu()).cuda()
+    w_q = weight.view((num_experts, n, k // 2)).view(torch.int8)
+    w_q = w_q.contiguous()
+
+    scale_interleaved = ref_scale.reshape(
+        ref_scale.shape[0], ref_scale.shape[1], (ref_scale.shape[2] // 4), 4
+    )  # [E, N, K/4, 4]
+    scale_interleaved = scale_interleaved.permute(0, 2, 1, 3)  # [E, K/4, N, 4]
+    scale_interleaved = scale_interleaved.reshape(
+        ref_scale.shape[0], ref_scale.shape[2] // 4, ref_scale.shape[1] * 4
+    )  # [E, K/4, N*4]
+    w_scale = scale_interleaved.contiguous()
+
+    return w_q, w_scale
+
+
+@pytest.mark.parametrize("M", [1, 2, 4, 8, 16])
+@pytest.mark.parametrize("N", [2048])
+@pytest.mark.parametrize("K", [7168])
+@pytest.mark.parametrize("E", [256])
+@pytest.mark.parametrize("ep_size", [8])
+@pytest.mark.parametrize("topk", [8])
+@pytest.mark.parametrize("group_size", [128])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
+    local_e = E // ep_size
+
+    debug = False
+    if debug:
+        a = torch.ones((M, K), dtype=dtype, device="cuda") * 0.001
+        ref_weight_1 = torch.ones((local_e, N * 2, K), dtype=torch.int8, device="cuda")
+        ref_weight_2 = torch.ones((local_e, K, N), dtype=torch.int8, device="cuda")
+        a1_scale = torch.ones(1, dtype=torch.float32, device="cuda")
+        a2_scale = torch.ones(1, dtype=torch.float32, device="cuda")
+        scale_1 = torch.ones(
+            (local_e, N * 2, K // group_size), dtype=dtype, device="cuda"
+        )
+        scale_2 = torch.ones((local_e, K, N // group_size), dtype=dtype, device="cuda")
+    else:
+        a = torch.randn(M, K, dtype=dtype, device="cuda")
+        ref_weight_1 = torch.randint(
+            -8, 8, (local_e, N * 2, K), dtype=torch.int8, device="cuda"
+        )
+        ref_weight_2 = torch.randint(
+            -8, 8, (local_e, K, N), dtype=torch.int8, device="cuda"
+        )
+        affine_coeff = 0.005
+        a1_scale = torch.randn(1, dtype=torch.float32, device="cuda")
+        a2_scale = torch.randn(1, dtype=torch.float32, device="cuda")
+        scale_1 = (
+            torch.randn(local_e, N * 2, K // group_size, dtype=dtype, device="cuda")
+            * affine_coeff
+        )
+        scale_2 = (
+            torch.randn(local_e, K, N // group_size, dtype=dtype, device="cuda")
+            * affine_coeff
+        )
+
+    w1_q, w1_scale = pack_interleave(local_e, ref_weight_1, scale_1)
+    w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2)
+
+    device = "cuda"
+    a_strides1 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
+    c_strides1 = torch.full((local_e, 3), 2 * N, device=device, dtype=torch.int64)
+    a_strides2 = torch.full((local_e, 3), N, device=device, dtype=torch.int64)
+    c_strides2 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
+    b_strides1 = a_strides1
+    s_strides13 = c_strides1
+    b_strides2 = a_strides2
+    s_strides2 = c_strides2
+
+    score = torch.randn((M, E), dtype=dtype, device=device)
+    topk_weights, topk_ids = select_experts(
+        hidden_states=a,
+        router_logits=score,
+        top_k=topk,
+        use_grouped_topk=False,
+        renormalize=False,
+    )
+    expert_map = torch.arange(E, dtype=torch.int32, device=device)
+    expert_map[local_e:] = E
+
+    output = cutlass_moe(
+        a,
+        w1_q,
+        w2_q,
+        w1_scale,
+        w2_scale,
+        topk_weights,
+        topk_ids,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides13,
+        s_strides2,
+        0,
+        local_e - 1,
+        E,
+        a1_scale,
+        a2_scale,
+        expert_map,
+    )
+
+    ref_output = ref(
+        a,
+        local_e,
+        topk_weights,
+        topk_ids,
+        ref_weight_1,
+        ref_weight_2,
+        scale_1,
+        scale_2,
+        has_pre_quant=True,
+        has_alpha=True,
+        pre_quant_scale_1=a1_scale,
+        pre_quant_scale_2=a2_scale,
+        alpha_1=a1_scale,
+        alpha_2=a2_scale,
+    )
+
+    # compare
+    torch.cuda.synchronize()
+
+    # compare final output
+    torch.testing.assert_close(output, ref_output, rtol=1e-2, atol=0.1)
+    print("SUCCESS: Final output tensors are close.")
+
+
+def cutlass_moe(
+    a: torch.Tensor,
+    w1_q: torch.Tensor,
+    w2_q: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids_: torch.Tensor,
+    a_strides1: torch.Tensor,
+    b_strides1: torch.Tensor,
+    c_strides1: torch.Tensor,
+    a_strides2: torch.Tensor,
+    b_strides2: torch.Tensor,
+    c_strides2: torch.Tensor,
+    s_strides13: torch.Tensor,
+    s_strides2: torch.Tensor,
+    start_expert_id: int,
+    end_expert_id: int,
+    E: int,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    expert_map: Optional[torch.Tensor] = None,
+    apply_router_weight_on_input: bool = False,
+):
+    local_topk_ids = topk_ids_
+    local_topk_ids = torch.where(expert_map[topk_ids_] != E, expert_map[topk_ids_], E)
+    device = a.device
+
+    local_num_experts = end_expert_id - start_expert_id + 1
+    expert_offsets = torch.empty(
+        (local_num_experts + 1), dtype=torch.int32, device=device
+    )
+    problem_sizes1 = torch.empty(
+        (local_num_experts, 3), dtype=torch.int32, device=device
+    )
+    problem_sizes2 = torch.empty(
+        (local_num_experts, 3), dtype=torch.int32, device=device
+    )
+    return cutlass_w4a8_moe(
+        start_expert_id,
+        end_expert_id,
+        E,
+        a,
+        w1_q,
+        w2_q,
+        w1_scale,
+        w2_scale,
+        topk_weights,
+        topk_ids_,
+        local_topk_ids,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides13,
+        s_strides2,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        a1_scale,
+        a2_scale,
+        apply_router_weight_on_input,
+    )
+
+
+def ref(
+    x: torch.Tensor,
+    num_experts: int,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    ref_weight_1: torch.Tensor,
+    ref_weight_2: torch.Tensor,
+    ref_weight_scale_1: torch.Tensor,
+    ref_weight_scale_2: torch.Tensor,
+    has_pre_quant: bool = False,
+    has_alpha: bool = False,
+    pre_quant_scale_1: Optional[torch.Tensor] = None,
+    pre_quant_scale_2: Optional[torch.Tensor] = None,
+    alpha_1: Optional[torch.Tensor] = None,
+    alpha_2: Optional[torch.Tensor] = None,
+):
+    results = torch.zeros_like(x)
+    dtype = x.dtype
+    for e_idx in range(num_experts):
+        mask = topk_ids == e_idx
+        activated_tokens = mask.sum(1).bool()
+        act = x[activated_tokens, :]
+        if act.shape[0] == 0:
+            continue
+        final_scale = (topk_weights * mask).sum(1)[activated_tokens].unsqueeze(1)
+
+        act = (
+            torch.clamp((act / pre_quant_scale_1.float()), -448.0, 448.0)
+            .to(torch.float8_e4m3fn)
+            .to(dtype)
+        )
+        w3_w1 = ref_weight_1[e_idx]
+        ref_w_scale_repeat = (
+            ref_weight_scale_1[e_idx].repeat_interleave(128, dim=1).to(float)
+        )
+        w3_w1 = (w3_w1.to(float) * ref_w_scale_repeat).to(dtype)
+        fc1 = ((torch.matmul(act, w3_w1.T)) * alpha_1).to(torch.float16)
+
+        gate, fc1 = fc1.chunk(2, dim=-1)
+        fc1 = fc1 * torch.nn.functional.silu(gate)
+        act = (fc1 / pre_quant_scale_2.float()).to(torch.float8_e4m3fn)
+        act = act.to(dtype)
+
+        w2 = ref_weight_2[e_idx]
+        ref_w_scale_repeat = (
+            ref_weight_scale_2[e_idx].repeat_interleave(128, dim=1).to(float)
+        )
+        w2 = (w2.to(float) * ref_w_scale_repeat).to(dtype)
+        fc2 = (torch.matmul(act, w2.T) * alpha_2).to(torch.float16)
+
+        results[activated_tokens, :] += (fc2 * final_scale).to(results.dtype)
+
+    return results
```
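As a quick sanity check of `pack_int4_values_to_int8` above: each adjacent pair of int4 values collapses into one int8 byte, low nibble first. A worked example with illustrative values:

```python
import torch

# Pair (3, -2): low nibble 3 = 0b0011; high nibble -2 = 0b1110 (two's complement).
# Packed byte 0b1110_0011 = -29 as signed int8.
vals = torch.tensor([[3, -2]], dtype=torch.int8)
low, high = vals[..., 0::2], vals[..., 1::2]
packed = (high << 4) | (low & 0x0F)
assert packed.item() == -29
```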
sglang/utils.py
CHANGED
```diff
@@ -1,6 +1,5 @@
 """Common utilities"""

-import base64
 import importlib
 import json
 import logging
@@ -20,6 +19,7 @@ from json import dumps
 from typing import Any, Callable, List, Optional, Tuple, Type, Union

 import numpy as np
+import pybase64
 import requests
 from IPython.display import HTML, display
 from pydantic import BaseModel
@@ -148,15 +148,15 @@ def encode_image_base64(image_path: Union[str, bytes]):
     if isinstance(image_path, str):
         with open(image_path, "rb") as image_file:
             data = image_file.read()
-        return base64.b64encode(data).decode("utf-8")
+        return pybase64.b64encode(data).decode("utf-8")
     elif isinstance(image_path, bytes):
-        return base64.b64encode(image_path).decode("utf-8")
+        return pybase64.b64encode(image_path).decode("utf-8")
     else:
         # image_path is PIL.WebPImagePlugin.WebPImageFile
         image = image_path
         buffered = BytesIO()
         image.save(buffered, format="PNG")
-        return base64.b64encode(buffered.getvalue()).decode("utf-8")
+        return pybase64.b64encode(buffered.getvalue()).decode("utf-8")


 def encode_frame(frame):
```
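`encode_image_base64` keeps its three input forms (path, raw bytes, PIL image); only the encoder changed. A sketch, with a placeholder path:

```python
from sglang.utils import encode_image_base64

with open("example.png", "rb") as f:  # placeholder path
    raw = f.read()

# Path and bytes inputs produce the same base64 text.
assert encode_image_base64("example.png") == encode_image_base64(raw)
```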
```diff
@@ -223,7 +223,7 @@ def encode_video_base64(video_path: str, num_frames: int = 16):
     video_bytes = b"".join(encoded_frames)

     # Encode the concatenated bytes to base64
-    video_base64 = "video:" + base64.b64encode(video_bytes).decode("utf-8")
+    video_base64 = "video:" + pybase64.b64encode(video_bytes).decode("utf-8")

     return video_base64

```
sglang/version.py
CHANGED
```diff
@@ -1 +1 @@
-__version__ = "0.4.9"
+__version__ = "0.4.9.post2"
```