sglang 0.4.3.post3__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (128)
  1. sglang/bench_serving.py +2 -2
  2. sglang/lang/chat_template.py +29 -0
  3. sglang/srt/_custom_ops.py +19 -17
  4. sglang/srt/configs/__init__.py +2 -0
  5. sglang/srt/configs/janus_pro.py +629 -0
  6. sglang/srt/configs/model_config.py +24 -14
  7. sglang/srt/conversation.py +80 -2
  8. sglang/srt/custom_op.py +64 -3
  9. sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
  10. sglang/srt/distributed/parallel_state.py +10 -1
  11. sglang/srt/entrypoints/engine.py +5 -3
  12. sglang/srt/entrypoints/http_server.py +1 -1
  13. sglang/srt/hf_transformers_utils.py +16 -1
  14. sglang/srt/layers/attention/flashinfer_backend.py +95 -49
  15. sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
  16. sglang/srt/layers/attention/triton_backend.py +5 -5
  17. sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
  18. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
  19. sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  20. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
  21. sglang/srt/layers/attention/vision.py +43 -62
  22. sglang/srt/layers/linear.py +1 -1
  23. sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  24. sglang/srt/layers/moe/ep_moe/layer.py +25 -9
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
  32. sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
  33. sglang/srt/layers/parameter.py +10 -0
  34. sglang/srt/layers/quantization/__init__.py +90 -68
  35. sglang/srt/layers/quantization/blockwise_int8.py +1 -2
  36. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  37. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  39. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  40. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  41. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  46. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  49. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  51. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/quantization/fp8.py +174 -106
  63. sglang/srt/layers/quantization/fp8_kernel.py +210 -38
  64. sglang/srt/layers/quantization/fp8_utils.py +156 -15
  65. sglang/srt/layers/quantization/modelopt_quant.py +5 -1
  66. sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
  67. sglang/srt/layers/quantization/w8a8_int8.py +152 -3
  68. sglang/srt/layers/rotary_embedding.py +5 -3
  69. sglang/srt/layers/sampler.py +29 -35
  70. sglang/srt/layers/vocab_parallel_embedding.py +0 -1
  71. sglang/srt/lora/backend/__init__.py +9 -12
  72. sglang/srt/managers/cache_controller.py +72 -8
  73. sglang/srt/managers/image_processor.py +37 -631
  74. sglang/srt/managers/image_processors/base_image_processor.py +219 -0
  75. sglang/srt/managers/image_processors/janus_pro.py +79 -0
  76. sglang/srt/managers/image_processors/llava.py +152 -0
  77. sglang/srt/managers/image_processors/minicpmv.py +86 -0
  78. sglang/srt/managers/image_processors/mlama.py +60 -0
  79. sglang/srt/managers/image_processors/qwen_vl.py +161 -0
  80. sglang/srt/managers/io_struct.py +33 -15
  81. sglang/srt/managers/multi_modality_padding.py +134 -0
  82. sglang/srt/managers/schedule_batch.py +212 -117
  83. sglang/srt/managers/schedule_policy.py +40 -8
  84. sglang/srt/managers/scheduler.py +258 -782
  85. sglang/srt/managers/scheduler_output_processor_mixin.py +611 -0
  86. sglang/srt/managers/tokenizer_manager.py +7 -6
  87. sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
  88. sglang/srt/mem_cache/base_prefix_cache.py +6 -8
  89. sglang/srt/mem_cache/chunk_cache.py +12 -44
  90. sglang/srt/mem_cache/hiradix_cache.py +63 -34
  91. sglang/srt/mem_cache/memory_pool.py +112 -46
  92. sglang/srt/mem_cache/paged_allocator.py +283 -0
  93. sglang/srt/mem_cache/radix_cache.py +117 -36
  94. sglang/srt/metrics/collector.py +8 -0
  95. sglang/srt/model_executor/cuda_graph_runner.py +10 -11
  96. sglang/srt/model_executor/forward_batch_info.py +12 -8
  97. sglang/srt/model_executor/model_runner.py +153 -134
  98. sglang/srt/model_loader/loader.py +2 -1
  99. sglang/srt/model_loader/weight_utils.py +1 -1
  100. sglang/srt/models/deepseek_janus_pro.py +2127 -0
  101. sglang/srt/models/deepseek_nextn.py +23 -3
  102. sglang/srt/models/deepseek_v2.py +25 -19
  103. sglang/srt/models/minicpmv.py +28 -89
  104. sglang/srt/models/mllama.py +1 -1
  105. sglang/srt/models/qwen2.py +0 -1
  106. sglang/srt/models/qwen2_5_vl.py +25 -50
  107. sglang/srt/models/qwen2_vl.py +33 -49
  108. sglang/srt/openai_api/adapter.py +37 -15
  109. sglang/srt/openai_api/protocol.py +8 -1
  110. sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
  111. sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
  112. sglang/srt/server_args.py +19 -20
  113. sglang/srt/speculative/build_eagle_tree.py +6 -1
  114. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -11
  115. sglang/srt/speculative/eagle_utils.py +2 -1
  116. sglang/srt/speculative/eagle_worker.py +109 -38
  117. sglang/srt/utils.py +104 -9
  118. sglang/test/runners.py +104 -10
  119. sglang/test/test_block_fp8.py +106 -16
  120. sglang/test/test_custom_ops.py +88 -0
  121. sglang/test/test_utils.py +20 -4
  122. sglang/utils.py +0 -4
  123. sglang/version.py +1 -1
  124. {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/METADATA +9 -9
  125. {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/RECORD +128 -83
  126. {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/WHEEL +1 -1
  127. {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/LICENSE +0 -0
  128. {sglang-0.4.3.post3.dist-info → sglang-0.4.4.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py CHANGED
@@ -14,6 +14,7 @@
 """Common utilities."""

 import base64
+import builtins
 import ctypes
 import dataclasses
 import io
@@ -37,6 +38,7 @@ import time
 import warnings
 from functools import lru_cache
 from importlib.metadata import PackageNotFoundError, version
+from importlib.util import find_spec
 from io import BytesIO
 from multiprocessing import Pool
 from multiprocessing.reduction import ForkingPickler
@@ -52,11 +54,13 @@ import triton
 import zmq
 from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
+from packaging.version import Version, parse
 from starlette.routing import Mount
 from torch import nn
 from torch.func import functional_call
 from torch.library import Library
 from torch.profiler import ProfilerActivity, profile, record_function
+from torch.utils.cpp_extension import CUDA_HOME
 from triton.runtime.cache import (
     FileCacheManager,
     default_cache_dir,
@@ -69,14 +73,31 @@ logger = logging.getLogger(__name__)
 show_time_cost = False
 time_infos = {}

+HIP_FP8_E4M3_FNUZ_MAX = 224.0

+
+# https://pytorch.org/docs/stable/notes/hip.html#checking-for-hip
 def is_hip() -> bool:
-    """Return whether it is HIP on the AMD ROCm platform."""
     return torch.version.hip is not None


+if is_hip():
+    FP8_E4M3_MAX = HIP_FP8_E4M3_FNUZ_MAX
+else:
+    FP8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
+
+FP8_E4M3_MIN = -FP8_E4M3_MAX
+
+builtins.FP8_E4M3_MAX = FP8_E4M3_MAX
+builtins.FP8_E4M3_MIN = FP8_E4M3_MIN
+
+
+def is_rocm() -> bool:
+    return torch.cuda.is_available() and torch.version.hip
+
+
 def is_cuda():
-    return hasattr(torch, "cuda") and torch.version.cuda is not None
+    return torch.cuda.is_available() and torch.version.cuda


 def is_cuda_alike():
@@ -98,11 +119,11 @@ def is_flashinfer_available():
     """
     if not get_bool_env_var("SGLANG_IS_FLASHINFER_AVAILABLE", default="true"):
         return False
-    return torch.cuda.is_available() and torch.version.cuda
+    return is_cuda()


 def is_cuda_available():
-    return torch.cuda.is_available() and torch.version.cuda
+    return is_cuda()


 def enable_show_time_cost():
@@ -1045,6 +1066,65 @@ def get_device_name(device_id: int = 0) -> str:
         return torch.hpu.get_device_name(device_id)


+@lru_cache(maxsize=1)
+def is_habana_available() -> bool:
+    return find_spec("habana_frameworks") is not None
+
+
+@lru_cache(maxsize=8)
+def get_device(device_id: Optional[int] = None) -> str:
+    if hasattr(torch, "cuda") and torch.cuda.is_available():
+        if device_id is None:
+            return "cuda"
+        return "cuda:{}".format(device_id)
+
+    if hasattr(torch, "xpu") and torch.xpu.is_available():
+        if device_id == None:
+            return "xpu"
+        return "xpu:{}".format(device_id)
+
+    if is_habana_available():
+        try:
+            import habana_frameworks.torch.hpu
+
+            if torch.hpu.is_available():
+                if device_id == None:
+                    return "hpu"
+                return "hpu:{}".format(device_id)
+        except ImportError as e:
+            raise ImportError(
+                "Habana frameworks detected, but failed to import 'habana_frameworks.torch.hpu'."
+            )
+
+    raise RuntimeError("No accelerator (CUDA, XPU, HPU) is available.")
+
+
+@lru_cache(maxsize=1)
+def get_device_count() -> int:
+    if hasattr(torch, "cuda") and torch.cuda.is_available():
+        try:
+            return torch.cuda.device_count()
+        except RuntimeError:
+            return 0
+
+    if hasattr(torch, "xpu") and torch.xpu.is_available():
+        try:
+            return torch.xpu.device_count()
+        except RuntimeError:
+            return 0
+
+    if is_habana_available():
+        try:
+            import habana_frameworks.torch.hpu
+
+            if torch.hpu.is_available():
+                return torch.hpu.device_count()
+        except (ImportError, RuntimeError):
+            return 0
+
+    return 0  # No accelerators available
+
+
 def get_device_core_count(device_id: int = 0) -> int:
     if hasattr(torch, "cuda") and torch.cuda.is_available():
         return torch.cuda.get_device_properties(device_id).multi_processor_count
@@ -1063,11 +1143,12 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
         )
         major, minor = int(major), int(minor)

-    # TODO(HandH1998): `get_device_capability` is not supported by `torch.hpu` for now.
-    # Update this once the support is available.
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         try:
-            major, minor = torch.hpu.get_device_capability(device_id)
+            # TODO(HandH1998): `get_device_capability` is not supported by `torch.hpu` for now.
+            # Update this once the support is available.
+            # major, minor = torch.hpu.get_device_capability(device_id)
+            major, minor = None, None
         except Exception as e:
             raise RuntimeError(
                 f"An error occurred while getting device capability of hpu: {e}."
@@ -1269,7 +1350,8 @@ def permute_weight(x: torch.Tensor) -> torch.Tensor:
     elif x.dtype == torch.float8_e4m3fnuz or x.dtype == torch.int8:
         x_ = x_.view(int(b_), int(n_ / 16), 16, int(k_ / 64), 4, 16)
     else:
-        return x_
+        # return x_
+        x_ = x_.view(int(b_), int(n_ / 16), 16, int(k_ / 8), 2, 4)
     x_ = x_.permute(0, 1, 3, 4, 2, 5)
     x_ = x_.contiguous()
@@ -1341,7 +1423,7 @@ def kill_itself_when_parent_died():
         libc = ctypes.CDLL("libc.so.6")
         libc.prctl(PR_SET_PDEATHSIG, signal.SIGKILL)
     else:
-        logger.warninig("kill_itself_when_parent_died is only supported in linux.")
+        logger.warning("kill_itself_when_parent_died is only supported in linux.")


 def set_uvicorn_logging_configs():
@@ -1430,6 +1512,12 @@ def rank0_print(msg: str):
         print(msg, flush=True)


+def get_cuda_version():
+    if torch.version.cuda:
+        return tuple(map(int, torch.version.cuda.split(".")))
+    return (0, 0)
+
+
 def launch_dummy_health_check_server(host, port):
     import uvicorn
     from fastapi import FastAPI, Response
@@ -1466,6 +1554,13 @@ def set_cuda_arch():
    os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"


+def next_power_of_2(n: int):
+    return 1 << (n - 1).bit_length() if n > 0 else 1
+
+
+setattr(triton, "next_power_of_2", next_power_of_2)
+
+
 def add_prefix(name: str, prefix: str) -> str:
     """Add a weight path prefix to a module name.

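A minimal usage sketch (not part of the published diff) of the accelerator helpers this release adds to sglang/srt/utils.py above; it assumes sglang 0.4.4 is importable and the printed values are only illustrative.

# Sketch only: exercises get_device, get_device_count, and get_cuda_version
# as introduced in the diff above. Assumes sglang 0.4.4 is installed.
from sglang.srt.utils import get_cuda_version, get_device, get_device_count

if get_device_count() > 0:
    # get_device() returns "cuda", "xpu", or "hpu" (optionally with an index);
    # it raises RuntimeError when no accelerator is present.
    print(get_device(0), get_cuda_version())
else:
    print("no accelerator (CUDA, XPU, HPU) detected")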
sglang/test/runners.py CHANGED
@@ -19,7 +19,7 @@ from typing import List, Optional, Tuple, Union

 import torch
 import torch.nn.functional as F
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor

 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.server import Engine
@@ -135,6 +135,76 @@ class HFRunner:
             return True
         return False

+    # copy from https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct/blob/main/gme_inference.py
+
+    def _get_gme_qwen2_vl_embeddings(
+        self, prompts, image_data: Optional[List[str]] = None
+    ):
+        from sglang.srt.utils import load_image
+
+        images = None
+        if image_data is not None:
+            images = [load_image(image)[0] for image in image_data]
+
+        inputs = self.processor(
+            text=prompts,
+            images=images,
+            padding=True,
+            truncation=True,
+            max_length=1800,
+            return_tensors="pt",
+        )
+        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+        with torch.no_grad():
+            embeddings = self._forward_gme_qwen2_vl(**inputs)
+        return embeddings.tolist()
+
+    def _forward_gme_qwen2_vl(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        pooling_mask: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        if inputs_embeds is None:
+            inputs_embeds = self.model.model.embed_tokens(input_ids)
+            if pixel_values is not None:
+                pixel_values = pixel_values.type(self.model.visual.get_dtype())
+                image_embeds = self.model.visual(
+                    pixel_values, grid_thw=image_grid_thw
+                ).to(inputs_embeds.device)
+                image_mask = input_ids == self.model.config.image_token_id
+                inputs_embeds[image_mask] = image_embeds
+        if attention_mask is not None:
+            attention_mask = attention_mask.to(inputs_embeds.device)
+
+        outputs = self.model.model(
+            input_ids=None,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+        )
+
+        pooling_mask = attention_mask if pooling_mask is None else pooling_mask
+        left_padding = pooling_mask[:, -1].sum() == pooling_mask.shape[0]  # TODO
+        if left_padding:
+            embeddings = outputs.last_hidden_state[:, -1]
+        else:
+            sequence_lengths = pooling_mask.sum(dim=1) - 1
+            batch_size = outputs.last_hidden_state.shape[0]
+            embeddings = outputs.last_hidden_state[
+                torch.arange(batch_size, device=outputs.last_hidden_state.device),
+                sequence_lengths,
+            ]
+        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+        return embeddings.contiguous()
+
     def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
         # Apply model-specific patches
         monkey_patch_gemma2_sdpa()
@@ -148,9 +218,18 @@
                 low_cpu_mem_usage=True,
             ).cuda()
         elif self.model_type == "embedding":
-            self.model = _get_sentence_transformer_embedding_model(
-                model_path, torch_dtype
-            )
+            if "gme-qwen2-vl" in model_path.lower():
+                self.model = AutoModelForVision2Seq.from_pretrained(
+                    model_path,
+                    torch_dtype=torch_dtype,
+                    trust_remote_code=False,
+                    low_cpu_mem_usage=True,
+                ).cuda()
+                self.processor = AutoProcessor.from_pretrained(model_path)
+            else:
+                self.model = _get_sentence_transformer_embedding_model(
+                    model_path, torch_dtype
+                )
         elif self.model_type == "reward":
             from transformers import AutoModelForSequenceClassification

@@ -169,7 +248,9 @@

         # Run forward
         while True:
-            prompts, max_new_tokens, lora_paths, token_ids_logprob = in_queue.get()
+            prompts, image_data, max_new_tokens, lora_paths, token_ids_logprob = (
+                in_queue.get()
+            )
             if lora_paths is not None:
                 assert len(prompts) == len(lora_paths)

@@ -189,7 +270,10 @@
                    )
                elif self.model_type == "embedding":
                    assert not self.output_str_only
-                    logits = self.model.encode(prompts).tolist()
+                    if "gme-qwen2-vl" in model_path.lower():
+                        logits = self._get_gme_qwen2_vl_embeddings(prompts, image_data)
+                    else:
+                        logits = self.model.encode(prompts).tolist()
                    out_queue.put(ModelOutput(embed_logits=logits))

                elif self.model_type == "reward":
@@ -211,11 +295,14 @@
     def forward(
         self,
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+        image_data: Optional[List[str]] = None,
         max_new_tokens: int = 8,
         lora_paths: Optional[List[str]] = None,
         token_ids_logprob: Optional[int] = None,
     ):
-        self.in_queue.put((prompts, max_new_tokens, lora_paths, token_ids_logprob))
+        self.in_queue.put(
+            (prompts, image_data, max_new_tokens, lora_paths, token_ids_logprob)
+        )
         return self.out_queue.get()

     def terminate(self):
@@ -396,6 +483,7 @@ class SRTRunner:
     def forward(
         self,
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+        image_data: Optional[List[str]] = None,
         max_new_tokens: int = 8,
         lora_paths: Optional[List[str]] = None,
         logprob_start_len: int = 0,
@@ -413,17 +501,23 @@
                 token_ids_logprob=token_ids_logprob,
             )
         else:
-            response = self.engine.encode(prompts)
             if self.model_type == "embedding":
-                logits = [x["embedding"] for x in response]
+                response = self.engine.encode(prompt=prompts, image_data=image_data)
+                if isinstance(response, list):
+                    logits = [x["embedding"] for x in response]
+                else:
+                    logits = [response["embedding"]]
                 return ModelOutput(embed_logits=logits)
+            # reward model
             else:
+                response = self.engine.encode(prompts)
                 scores = [x["embedding"][0] for x in response]
                 return ModelOutput(scores=scores)

     def batch_forward(
         self,
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+        image_data: Optional[List[str]] = None,
         max_new_tokens=8,
         lora_paths=None,
     ):
@@ -439,7 +533,7 @@
                 lora_paths=lora_paths,
             )
         else:
-            response = self.engine.encode(prompts)
+            response = self.engine.encode(prompts, image_data)
             if self.model_type == "embedding":
                 logits = [x["embedding"] for x in response]
                 return ModelOutput(embed_logits=logits)
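With the changes above, HFRunner.forward and SRTRunner.forward accept an optional image_data argument that feeds the new gme-Qwen2-VL embedding path. A hedged sketch follows (not part of the published diff); the model path and image URL are placeholders, and the constructor arguments shown are indicative rather than exhaustive.

# Sketch only: embedding a text prompt plus an image through the updated runner API.
# "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct" matches the "gme-qwen2-vl" branch added above;
# the image URL is a placeholder.
import torch

from sglang.test.runners import HFRunner

runner = HFRunner(
    "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float16,
    model_type="embedding",
)
out = runner.forward(
    prompts=["a photo of a cat"],
    image_data=["https://example.com/cat.jpg"],  # placeholder URL
)
print(len(out.embed_logits[0]))  # embedding width
runner.terminate()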
sglang/test/test_block_fp8.py CHANGED
@@ -1,4 +1,5 @@
 import itertools
+import os
 import unittest

 import torch
@@ -7,9 +8,12 @@ from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
 from sglang.srt.layers.quantization.fp8_kernel import (
     per_token_group_quant_fp8,
+    static_quant_fp8,
     w8a8_block_fp8_matmul,
 )

+_is_cuda = torch.cuda.is_available() and torch.version.cuda
+

 # For test
 def native_per_token_group_quant_fp8(
@@ -63,7 +67,7 @@ class TestPerTokenGroupQuantFP8(unittest.TestCase):
         out, scale = per_token_group_quant_fp8(x, group_size)

         self.assertTrue(
-            torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.15)
+            torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.20)
         )
         self.assertTrue(torch.allclose(scale, ref_scale))

@@ -85,6 +89,71 @@ class TestPerTokenGroupQuantFP8(unittest.TestCase):
                 self._per_token_group_quant_fp8(*params)


+# For test
+def native_static_quant_fp8(x, x_s, dtype=torch.float8_e4m3fn):
+    """Function to perform static quantization on an input tensor `x` using native torch.
+
+    It converts the tensor values into float8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+    """
+    assert x.is_contiguous(), "`x` is not contiguous"
+    assert x_s.numel() == 1, "only supports per-tensor scale"
+
+    finfo = torch.finfo(dtype)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    x_ = x.reshape(x.numel() // x.shape[-1], x.shape[-1])
+    x_s_inv = 1.0 / x_s
+    x_q = (x_ * x_s_inv).clamp(min=fp8_min, max=fp8_max).to(dtype)
+    x_q = x_q.reshape(x.shape)
+
+    return x_q, x_s
+
+
+class TestStaticQuantFP8(unittest.TestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _static_quant_fp8(self, num_tokens, d, dtype, seed):
+        torch.manual_seed(seed)
+
+        x = torch.rand(num_tokens, d, dtype=dtype)
+        fp8_max = torch.finfo(torch.float8_e4m3fn).max
+        x_s = x.max() / fp8_max
+
+        with torch.inference_mode():
+            ref_out, _ = native_static_quant_fp8(x, x_s)
+            out, _ = static_quant_fp8(x, x_s, repeat_scale=True)
+
+        self.assertTrue(
+            torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.50)
+        )
+
+    def test_static_quant_fp8(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                dtype=params[2],
+                seed=params[3],
+            ):
+                self._static_quant_fp8(*params)
+
+
 # For test
 def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
     """This function performs matrix multiplication with block-wise quantization using native torch.
@@ -142,13 +211,35 @@ def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.fl


 class TestW8A8BlockFP8Matmul(unittest.TestCase):
-    OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
-    M = [1, 7, 83, 512, 2048]
-    N = [128, 512, 1024, 4096, 7748, 13824]
-    K = [256, 4096, 5120, 3884, 13824]
-    # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
-    BLOCK_SIZE = [[128, 128]]
-    SEEDS = [0]
+
+    if not _is_cuda:
+        OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
+        M = [1, 7, 83, 512, 2048]
+        NKs = [
+            (N, K)
+            for N in [128, 512, 1024, 4096, 7748, 13824]
+            for K in [256, 4096, 5120, 3884, 13824]
+        ]
+        # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+        BLOCK_SIZE = [[128, 128]]
+        SEEDS = [0]
+    else:
+        # use practical shape in DeepSeek V3 for test
+        OUT_DTYPES = [torch.bfloat16]
+        M = [64, 128, 512, 1024, 4096]
+        NKs = [
+            (1536, 7168),
+            (3072, 1536),
+            (24576, 7168),
+            (4096, 512),
+            (7168, 2048),
+            (4608, 7168),
+            (512, 7168),
+            (7168, 2304),
+            (7168, 512),
+        ]
+        BLOCK_SIZE = [[128, 128]]
+        SEEDS = [0]

     @classmethod
     def setUpClass(cls):
@@ -156,7 +247,8 @@ class TestW8A8BlockFP8Matmul(unittest.TestCase):
             raise unittest.SkipTest("CUDA is not available")
         torch.set_default_device("cuda")

-    def _w8a8_block_fp8_matmul(self, M, N, K, block_size, out_dtype, seed):
+    def _w8a8_block_fp8_matmul(self, M, NK, block_size, out_dtype, seed):
+        N, K = NK
         torch.manual_seed(seed)
         # NOTE(HandH1998): to avoid overflow when out_dtype = torch.half
         factor_for_scale = 1e-2
@@ -191,19 +283,17 @@ class TestW8A8BlockFP8Matmul(unittest.TestCase):
     def test_w8a8_block_fp8_matmul(self):
         for params in itertools.product(
             self.M,
-            self.N,
-            self.K,
+            self.NKs,
             self.BLOCK_SIZE,
             self.OUT_DTYPES,
             self.SEEDS,
         ):
             with self.subTest(
                 M=params[0],
-                N=params[1],
-                K=params[2],
-                block_size=params[3],
-                out_dtype=params[4],
-                seed=params[5],
+                NKs=params[1],
+                block_size=params[2],
+                out_dtype=params[3],
+                seed=params[4],
             ):
                 self._w8a8_block_fp8_matmul(*params)

sglang/test/test_custom_ops.py ADDED
@@ -0,0 +1,88 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/8ca7a71df787ad711ad3ac70a5bd2eb2bb398938/tests/quantization/test_fp8.py
+
+import pytest
+import torch
+
+from sglang.srt.custom_op import scaled_fp8_quant
+from sglang.srt.utils import is_cuda
+
+
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+def test_scaled_fp8_quant_per_tensor(dtype) -> None:
+
+    def quantize_ref_per_tensor(tensor, inv_scale):
+        # The reference implementation that fully aligns to
+        # the kernel being tested.
+        finfo = torch.finfo(torch.float8_e4m3fn)
+        scale = inv_scale.reciprocal()
+        qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
+        qweight = qweight.to(torch.float8_e4m3fn)
+        return qweight
+
+    def dequantize_per_tensor(tensor, inv_scale, dtype):
+        fake_qweight = tensor.to(dtype)
+        dq_weight = fake_qweight * inv_scale
+        return dq_weight
+
+    # Note that we use a shape % 8 != 0 to cover edge cases,
+    # because scaled_fp8_quant is vectorized by 8.
+    x = (torch.randn(size=(11, 11), device="cuda") * 13).to(dtype)
+
+    # Test Per Tensor Dynamic quantization
+    # scale = max(abs(x)) / FP8_E4M3_MAX
+    y, scale = scaled_fp8_quant(x, None)
+    ref_y = quantize_ref_per_tensor(x, scale)
+    torch.testing.assert_close(y, ref_y)
+    torch.testing.assert_close(
+        dequantize_per_tensor(y, scale, dtype),
+        dequantize_per_tensor(ref_y, scale, dtype),
+    )
+
+    # Test Per Tensor Static quantization
+    y, _ = scaled_fp8_quant(x, scale)
+    ref_y = quantize_ref_per_tensor(x, scale)
+    torch.testing.assert_close(y, ref_y)
+    torch.testing.assert_close(
+        dequantize_per_tensor(y, scale, dtype),
+        dequantize_per_tensor(ref_y, scale, dtype),
+    )
+
+
+if is_cuda:
+
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+    def test_scaled_fp8_quant_per_token_dynamic(dtype) -> None:
+        def quantize_ref_per_token(tensor, inv_scale):
+            # The reference implementation that fully aligns to
+            # the kernel being tested.
+            finfo = torch.finfo(torch.float8_e4m3fn)
+            scale = inv_scale.reciprocal()
+            qweight = (tensor.to(torch.float32) * scale).clamp(
+                min=finfo.min, max=finfo.max
+            )
+            qweight = qweight.to(torch.float8_e4m3fn)
+            return qweight
+
+        def dequantize_per_token(tensor, inv_scale, dtype):
+            fake_qweight = tensor.to(dtype)
+            dq_weight = fake_qweight * inv_scale
+            return dq_weight
+
+        # Note that we use a shape % 8 = 0,
+        # because per_token_quant_fp8 is vectorized by 8 elements.
+        x = (torch.randn(size=(11, 16), device="cuda") * 13).to(dtype)
+
+        # Test Per Tensor Dynamic quantization
+        # scale = max(abs(x)) / FP8_E4M3_MAX
+        y, scale = scaled_fp8_quant(x, None, use_per_token_if_dynamic=True)
+        ref_y = quantize_ref_per_token(x, scale)
+        torch.testing.assert_close(y, ref_y)
+        torch.testing.assert_close(
+            dequantize_per_token(y, scale, dtype),
+            dequantize_per_token(ref_y, scale, dtype),
+        )
+
+
+if __name__ == "__main__":
+    # Run the specific test function directly
+    pytest.main([__file__])
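The two new FP8 test files above check the same per-tensor recipe: divide by a single scale, clamp to the float8_e4m3fn range, cast, then multiply the scale back in to dequantize. Below is a self-contained sketch of that round trip in plain PyTorch (not part of the diff, and not using the sglang kernels); all names are local to this example.

# Sketch only: per-tensor FP8 (e4m3fn) quantize/dequantize round trip,
# mirroring the reference implementations in the tests above.
import torch

x = torch.randn(11, 16, dtype=torch.float32)
fp8 = torch.finfo(torch.float8_e4m3fn)            # max is 448.0 for e4m3fn
scale = x.abs().max() / fp8.max                   # dynamic per-tensor scale
x_q = (x / scale).clamp(min=fp8.min, max=fp8.max).to(torch.float8_e4m3fn)
x_dq = x_q.to(torch.float32) * scale              # dequantize
print((x - x_dq).abs().max())                     # quantization error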