sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +1 -1
- sglang/lang/chat_template.py +29 -0
- sglang/srt/_custom_ops.py +19 -17
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/janus_pro.py +629 -0
- sglang/srt/configs/model_config.py +24 -14
- sglang/srt/conversation.py +80 -2
- sglang/srt/custom_op.py +64 -3
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
- sglang/srt/distributed/parallel_state.py +10 -1
- sglang/srt/entrypoints/engine.py +5 -3
- sglang/srt/entrypoints/http_server.py +1 -1
- sglang/srt/function_call_parser.py +33 -2
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
- sglang/srt/layers/attention/triton_backend.py +1 -3
- sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
- sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
- sglang/srt/layers/attention/vision.py +43 -62
- sglang/srt/layers/dp_attention.py +30 -2
- sglang/srt/layers/elementwise.py +411 -0
- sglang/srt/layers/linear.py +1 -1
- sglang/srt/layers/logits_processor.py +1 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +25 -9
- sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
- sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
- sglang/srt/layers/moe/router.py +342 -0
- sglang/srt/layers/parameter.py +10 -0
- sglang/srt/layers/quantization/__init__.py +90 -68
- sglang/srt/layers/quantization/blockwise_int8.py +1 -2
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +174 -106
- sglang/srt/layers/quantization/fp8_kernel.py +210 -38
- sglang/srt/layers/quantization/fp8_utils.py +156 -15
- sglang/srt/layers/quantization/modelopt_quant.py +5 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
- sglang/srt/layers/quantization/w8a8_int8.py +152 -3
- sglang/srt/layers/rotary_embedding.py +5 -3
- sglang/srt/layers/sampler.py +29 -35
- sglang/srt/layers/vocab_parallel_embedding.py +0 -1
- sglang/srt/lora/backend/__init__.py +9 -12
- sglang/srt/managers/cache_controller.py +74 -8
- sglang/srt/managers/data_parallel_controller.py +1 -1
- sglang/srt/managers/image_processor.py +37 -631
- sglang/srt/managers/image_processors/base_image_processor.py +219 -0
- sglang/srt/managers/image_processors/janus_pro.py +79 -0
- sglang/srt/managers/image_processors/llava.py +152 -0
- sglang/srt/managers/image_processors/minicpmv.py +86 -0
- sglang/srt/managers/image_processors/mlama.py +60 -0
- sglang/srt/managers/image_processors/qwen_vl.py +161 -0
- sglang/srt/managers/io_struct.py +32 -15
- sglang/srt/managers/multi_modality_padding.py +134 -0
- sglang/srt/managers/schedule_batch.py +213 -118
- sglang/srt/managers/schedule_policy.py +40 -8
- sglang/srt/managers/scheduler.py +176 -683
- sglang/srt/managers/scheduler_output_processor_mixin.py +614 -0
- sglang/srt/managers/tokenizer_manager.py +6 -6
- sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
- sglang/srt/mem_cache/base_prefix_cache.py +6 -8
- sglang/srt/mem_cache/chunk_cache.py +12 -44
- sglang/srt/mem_cache/hiradix_cache.py +71 -34
- sglang/srt/mem_cache/memory_pool.py +81 -17
- sglang/srt/mem_cache/paged_allocator.py +283 -0
- sglang/srt/mem_cache/radix_cache.py +117 -36
- sglang/srt/model_executor/cuda_graph_runner.py +68 -20
- sglang/srt/model_executor/forward_batch_info.py +23 -10
- sglang/srt/model_executor/model_runner.py +63 -63
- sglang/srt/model_loader/loader.py +2 -1
- sglang/srt/model_loader/weight_utils.py +1 -1
- sglang/srt/models/deepseek_janus_pro.py +2127 -0
- sglang/srt/models/deepseek_nextn.py +23 -3
- sglang/srt/models/deepseek_v2.py +200 -191
- sglang/srt/models/grok.py +374 -119
- sglang/srt/models/minicpmv.py +28 -89
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/qwen2.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +25 -50
- sglang/srt/models/qwen2_vl.py +33 -49
- sglang/srt/openai_api/adapter.py +59 -35
- sglang/srt/openai_api/protocol.py +8 -1
- sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
- sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
- sglang/srt/server_args.py +24 -16
- sglang/srt/speculative/eagle_worker.py +75 -39
- sglang/srt/utils.py +104 -9
- sglang/test/runners.py +104 -10
- sglang/test/test_block_fp8.py +106 -16
- sglang/test/test_custom_ops.py +88 -0
- sglang/test/test_utils.py +20 -4
- sglang/utils.py +0 -4
- sglang/version.py +1 -1
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/METADATA +9 -10
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/RECORD +131 -84
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/WHEEL +1 -1
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/top_level.txt +0 -0
sglang/test/runners.py
CHANGED
@@ -19,7 +19,7 @@ from typing import List, Optional, Tuple, Union
 
 import torch
 import torch.nn.functional as F
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.server import Engine
@@ -135,6 +135,76 @@ class HFRunner:
             return True
         return False
 
+    # copy from https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct/blob/main/gme_inference.py
+
+    def _get_gme_qwen2_vl_embeddings(
+        self, prompts, image_data: Optional[List[str]] = None
+    ):
+        from sglang.srt.utils import load_image
+
+        images = None
+        if image_data is not None:
+            images = [load_image(image)[0] for image in image_data]
+
+        inputs = self.processor(
+            text=prompts,
+            images=images,
+            padding=True,
+            truncation=True,
+            max_length=1800,
+            return_tensors="pt",
+        )
+        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+        with torch.no_grad():
+            embeddings = self._forward_gme_qwen2_vl(**inputs)
+        return embeddings.tolist()
+
+    def _forward_gme_qwen2_vl(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        pooling_mask: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        if inputs_embeds is None:
+            inputs_embeds = self.model.model.embed_tokens(input_ids)
+            if pixel_values is not None:
+                pixel_values = pixel_values.type(self.model.visual.get_dtype())
+                image_embeds = self.model.visual(
+                    pixel_values, grid_thw=image_grid_thw
+                ).to(inputs_embeds.device)
+                image_mask = input_ids == self.model.config.image_token_id
+                inputs_embeds[image_mask] = image_embeds
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(inputs_embeds.device)
+
+        outputs = self.model.model(
+            input_ids=None,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+        )
+
+        pooling_mask = attention_mask if pooling_mask is None else pooling_mask
+        left_padding = pooling_mask[:, -1].sum() == pooling_mask.shape[0]  # TODO
+        if left_padding:
+            embeddings = outputs.last_hidden_state[:, -1]
+        else:
+            sequence_lengths = pooling_mask.sum(dim=1) - 1
+            batch_size = outputs.last_hidden_state.shape[0]
+            embeddings = outputs.last_hidden_state[
+                torch.arange(batch_size, device=outputs.last_hidden_state.device),
+                sequence_lengths,
+            ]
+        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+        return embeddings.contiguous()
+
     def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
         # Apply model-specific patches
         monkey_patch_gemma2_sdpa()
@@ -148,9 +218,18 @@ class HFRunner:
                 low_cpu_mem_usage=True,
             ).cuda()
         elif self.model_type == "embedding":
-            self.model = _get_sentence_transformer_embedding_model(
-                model_path, torch_dtype
-            )
+            if "gme-qwen2-vl" in model_path.lower():
+                self.model = AutoModelForVision2Seq.from_pretrained(
+                    model_path,
+                    torch_dtype=torch_dtype,
+                    trust_remote_code=False,
+                    low_cpu_mem_usage=True,
+                ).cuda()
+                self.processor = AutoProcessor.from_pretrained(model_path)
+            else:
+                self.model = _get_sentence_transformer_embedding_model(
+                    model_path, torch_dtype
+                )
         elif self.model_type == "reward":
             from transformers import AutoModelForSequenceClassification
 
@@ -169,7 +248,9 @@ class HFRunner:
 
         # Run forward
         while True:
-            prompts, max_new_tokens, lora_paths, token_ids_logprob = in_queue.get()
+            prompts, image_data, max_new_tokens, lora_paths, token_ids_logprob = (
+                in_queue.get()
+            )
            if lora_paths is not None:
                assert len(prompts) == len(lora_paths)
 
@@ -189,7 +270,10 @@ class HFRunner:
                 )
             elif self.model_type == "embedding":
                 assert not self.output_str_only
-                logits = self.model.encode(prompts).tolist()
+                if "gme-qwen2-vl" in model_path.lower():
+                    logits = self._get_gme_qwen2_vl_embeddings(prompts, image_data)
+                else:
+                    logits = self.model.encode(prompts).tolist()
                 out_queue.put(ModelOutput(embed_logits=logits))
 
             elif self.model_type == "reward":
@@ -211,11 +295,14 @@ class HFRunner:
     def forward(
         self,
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+        image_data: Optional[List[str]] = None,
         max_new_tokens: int = 8,
         lora_paths: Optional[List[str]] = None,
        token_ids_logprob: Optional[int] = None,
     ):
-        self.in_queue.put((prompts, max_new_tokens, lora_paths, token_ids_logprob))
+        self.in_queue.put(
+            (prompts, image_data, max_new_tokens, lora_paths, token_ids_logprob)
+        )
         return self.out_queue.get()
 
     def terminate(self):
@@ -396,6 +483,7 @@ class SRTRunner:
     def forward(
         self,
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+        image_data: Optional[List[str]] = None,
         max_new_tokens: int = 8,
         lora_paths: Optional[List[str]] = None,
         logprob_start_len: int = 0,
@@ -413,17 +501,23 @@ class SRTRunner:
                 token_ids_logprob=token_ids_logprob,
             )
         else:
-            response = self.engine.encode(prompts)
             if self.model_type == "embedding":
-                logits = [x["embedding"] for x in response]
+                response = self.engine.encode(prompt=prompts, image_data=image_data)
+                if isinstance(response, list):
+                    logits = [x["embedding"] for x in response]
+                else:
+                    logits = [response["embedding"]]
                 return ModelOutput(embed_logits=logits)
+            # reward model
             else:
+                response = self.engine.encode(prompts)
                 scores = [x["embedding"][0] for x in response]
                 return ModelOutput(scores=scores)
 
     def batch_forward(
         self,
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+        image_data: Optional[List[str]] = None,
         max_new_tokens=8,
         lora_paths=None,
     ):
@@ -439,7 +533,7 @@ class SRTRunner:
                 lora_paths=lora_paths,
             )
         else:
-            response = self.engine.encode(prompts)
+            response = self.engine.encode(prompts, image_data)
             if self.model_type == "embedding":
                 logits = [x["embedding"] for x in response]
                 return ModelOutput(embed_logits=logits)
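For context, a minimal usage sketch of the `image_data` argument this change threads through the test runners. `forward(..., image_data=...)`, `terminate()`, and the `embed_logits` field appear in the diff above; the `HFRunner` constructor keyword arguments, the model path, and the image URL below are assumptions for illustration, and actually running it would need a GPU and network access.

```python
# Hypothetical usage sketch (not part of the package): exercise the new
# image_data argument of HFRunner.forward. forward()/terminate() and the
# embed_logits field come from the diff; constructor kwargs, model path,
# and image URL are assumed placeholders.
import torch

from sglang.test.runners import HFRunner

MODEL_PATH = "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct"  # assumed gme-qwen2-vl checkpoint
PROMPTS = ["a photo of a cat sitting on a laptop"]
IMAGES = ["https://example.com/cat.png"]  # placeholder image URL

if __name__ == "__main__":
    runner = HFRunner(MODEL_PATH, torch_dtype=torch.float16, model_type="embedding")
    try:
        out = runner.forward(PROMPTS, image_data=IMAGES)
        # One embedding vector per prompt; the gme path L2-normalizes them.
        print(len(out.embed_logits), len(out.embed_logits[0]))
    finally:
        runner.terminate()
```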
sglang/test/test_block_fp8.py
CHANGED
@@ -1,4 +1,5 @@
 import itertools
+import os
 import unittest
 
 import torch
@@ -7,9 +8,12 @@ from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
 from sglang.srt.layers.quantization.fp8_kernel import (
     per_token_group_quant_fp8,
+    static_quant_fp8,
     w8a8_block_fp8_matmul,
 )
 
+_is_cuda = torch.cuda.is_available() and torch.version.cuda
+
 
 # For test
 def native_per_token_group_quant_fp8(
@@ -63,7 +67,7 @@ class TestPerTokenGroupQuantFP8(unittest.TestCase):
             out, scale = per_token_group_quant_fp8(x, group_size)
 
         self.assertTrue(
-            torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.
+            torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.20)
         )
         self.assertTrue(torch.allclose(scale, ref_scale))
 
@@ -85,6 +89,71 @@ class TestPerTokenGroupQuantFP8(unittest.TestCase):
                 self._per_token_group_quant_fp8(*params)
 
 
+# For test
+def native_static_quant_fp8(x, x_s, dtype=torch.float8_e4m3fn):
+    """Function to perform static quantization on an input tensor `x` using native torch.
+
+    It converts the tensor values into float8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+    """
+    assert x.is_contiguous(), "`x` is not contiguous"
+    assert x_s.numel() == 1, "only supports per-tensor scale"
+
+    finfo = torch.finfo(dtype)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    x_ = x.reshape(x.numel() // x.shape[-1], x.shape[-1])
+    x_s_inv = 1.0 / x_s
+    x_q = (x_ * x_s_inv).clamp(min=fp8_min, max=fp8_max).to(dtype)
+    x_q = x_q.reshape(x.shape)
+
+    return x_q, x_s
+
+
+class TestStaticQuantFP8(unittest.TestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _static_quant_fp8(self, num_tokens, d, dtype, seed):
+        torch.manual_seed(seed)
+
+        x = torch.rand(num_tokens, d, dtype=dtype)
+        fp8_max = torch.finfo(torch.float8_e4m3fn).max
+        x_s = x.max() / fp8_max
+
+        with torch.inference_mode():
+            ref_out, _ = native_static_quant_fp8(x, x_s)
+            out, _ = static_quant_fp8(x, x_s, repeat_scale=True)
+
+        self.assertTrue(
+            torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.50)
+        )
+
+    def test_static_quant_fp8(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                dtype=params[2],
+                seed=params[3],
+            ):
+                self._static_quant_fp8(*params)
+
+
 # For test
 def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
     """This function performs matrix multiplication with block-wise quantization using native torch.
@@ -142,13 +211,35 @@ def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.fl
 
 
 class TestW8A8BlockFP8Matmul(unittest.TestCase):
-    OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
-    M = [1, 7, 83, 512, 2048]
-    N = [128, 512, 1024, 4096, 7748, 13824]
-    K = [256, 4096, 5120, 3884, 13824]
-    # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
-    BLOCK_SIZE = [[128, 128]]
-    SEEDS = [0]
+
+    if not _is_cuda:
+        OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
+        M = [1, 7, 83, 512, 2048]
+        NKs = [
+            (N, K)
+            for N in [128, 512, 1024, 4096, 7748, 13824]
+            for K in [256, 4096, 5120, 3884, 13824]
+        ]
+        # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+        BLOCK_SIZE = [[128, 128]]
+        SEEDS = [0]
+    else:
+        # use practical shape in DeepSeek V3 for test
+        OUT_DTYPES = [torch.bfloat16]
+        M = [64, 128, 512, 1024, 4096]
+        NKs = [
+            (1536, 7168),
+            (3072, 1536),
+            (24576, 7168),
+            (4096, 512),
+            (7168, 2048),
+            (4608, 7168),
+            (512, 7168),
+            (7168, 2304),
+            (7168, 512),
+        ]
+        BLOCK_SIZE = [[128, 128]]
+        SEEDS = [0]
 
     @classmethod
     def setUpClass(cls):
@@ -156,7 +247,8 @@ class TestW8A8BlockFP8Matmul(unittest.TestCase):
             raise unittest.SkipTest("CUDA is not available")
         torch.set_default_device("cuda")
 
-    def _w8a8_block_fp8_matmul(self, M, N, K, block_size, out_dtype, seed):
+    def _w8a8_block_fp8_matmul(self, M, NK, block_size, out_dtype, seed):
+        N, K = NK
         torch.manual_seed(seed)
         # NOTE(HandH1998): to avoid overflow when out_dtype = torch.half
         factor_for_scale = 1e-2
@@ -191,19 +283,17 @@ class TestW8A8BlockFP8Matmul(unittest.TestCase):
     def test_w8a8_block_fp8_matmul(self):
         for params in itertools.product(
             self.M,
-            self.N,
-            self.K,
+            self.NKs,
             self.BLOCK_SIZE,
             self.OUT_DTYPES,
             self.SEEDS,
         ):
             with self.subTest(
                 M=params[0],
-                N=params[1],
-                K=params[2],
-                block_size=params[3],
-                out_dtype=params[4],
-                seed=params[5],
+                NKs=params[1],
+                block_size=params[2],
+                out_dtype=params[3],
+                seed=params[4],
            ):
                 self._w8a8_block_fp8_matmul(*params)
 
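For reference, the static per-tensor scheme that `native_static_quant_fp8` above checks reduces to clamping `x / s` into the float8 range and casting, with dequantization being a multiply by the same scale. A standalone sketch of that round trip (pure torch, CPU-only, assuming a torch build with `float8_e4m3fn`, i.e. torch >= 2.1) follows; it is not part of the package.

```python
# Standalone sketch of the per-tensor static FP8 round trip that
# native_static_quant_fp8 above implements; assumes torch >= 2.1 for the
# float8_e4m3fn dtype. Not part of the package, CPU-only.
import torch

def static_quant_fp8_ref(x: torch.Tensor, x_s: torch.Tensor, dtype=torch.float8_e4m3fn):
    # q = clamp(x / s, fp8_min, fp8_max) cast to float8; the scale is passed through.
    finfo = torch.finfo(dtype)
    x_q = (x * (1.0 / x_s)).clamp(min=finfo.min, max=finfo.max).to(dtype)
    return x_q, x_s

x = torch.randn(4, 8, dtype=torch.float32)
x_s = x.abs().max() / torch.finfo(torch.float8_e4m3fn).max  # one scale per tensor
x_q, _ = static_quant_fp8_ref(x, x_s)
x_dq = x_q.to(torch.float32) * x_s  # dequantize: multiply back by the scale
print("max abs round-trip error:", (x - x_dq).abs().max().item())
```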
sglang/test/test_custom_ops.py
ADDED
@@ -0,0 +1,88 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/8ca7a71df787ad711ad3ac70a5bd2eb2bb398938/tests/quantization/test_fp8.py
+
+import pytest
+import torch
+
+from sglang.srt.custom_op import scaled_fp8_quant
+from sglang.srt.utils import is_cuda
+
+
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+def test_scaled_fp8_quant_per_tensor(dtype) -> None:
+
+    def quantize_ref_per_tensor(tensor, inv_scale):
+        # The reference implementation that fully aligns to
+        # the kernel being tested.
+        finfo = torch.finfo(torch.float8_e4m3fn)
+        scale = inv_scale.reciprocal()
+        qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
+        qweight = qweight.to(torch.float8_e4m3fn)
+        return qweight
+
+    def dequantize_per_tensor(tensor, inv_scale, dtype):
+        fake_qweight = tensor.to(dtype)
+        dq_weight = fake_qweight * inv_scale
+        return dq_weight
+
+    # Note that we use a shape % 8 != 0 to cover edge cases,
+    # because scaled_fp8_quant is vectorized by 8.
+    x = (torch.randn(size=(11, 11), device="cuda") * 13).to(dtype)
+
+    # Test Per Tensor Dynamic quantization
+    # scale = max(abs(x)) / FP8_E4M3_MAX
+    y, scale = scaled_fp8_quant(x, None)
+    ref_y = quantize_ref_per_tensor(x, scale)
+    torch.testing.assert_close(y, ref_y)
+    torch.testing.assert_close(
+        dequantize_per_tensor(y, scale, dtype),
+        dequantize_per_tensor(ref_y, scale, dtype),
+    )
+
+    # Test Per Tensor Static quantization
+    y, _ = scaled_fp8_quant(x, scale)
+    ref_y = quantize_ref_per_tensor(x, scale)
+    torch.testing.assert_close(y, ref_y)
+    torch.testing.assert_close(
+        dequantize_per_tensor(y, scale, dtype),
+        dequantize_per_tensor(ref_y, scale, dtype),
+    )
+
+
+if is_cuda:
+
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+    def test_scaled_fp8_quant_per_token_dynamic(dtype) -> None:
+        def quantize_ref_per_token(tensor, inv_scale):
+            # The reference implementation that fully aligns to
+            # the kernel being tested.
+            finfo = torch.finfo(torch.float8_e4m3fn)
+            scale = inv_scale.reciprocal()
+            qweight = (tensor.to(torch.float32) * scale).clamp(
+                min=finfo.min, max=finfo.max
+            )
+            qweight = qweight.to(torch.float8_e4m3fn)
+            return qweight
+
+        def dequantize_per_token(tensor, inv_scale, dtype):
+            fake_qweight = tensor.to(dtype)
+            dq_weight = fake_qweight * inv_scale
+            return dq_weight
+
+        # Note that we use a shape % 8 = 0,
+        # because per_token_quant_fp8 is vectorized by 8 elements.
+        x = (torch.randn(size=(11, 16), device="cuda") * 13).to(dtype)
+
+        # Test Per Tensor Dynamic quantization
+        # scale = max(abs(x)) / FP8_E4M3_MAX
+        y, scale = scaled_fp8_quant(x, None, use_per_token_if_dynamic=True)
+        ref_y = quantize_ref_per_token(x, scale)
+        torch.testing.assert_close(y, ref_y)
+        torch.testing.assert_close(
+            dequantize_per_token(y, scale, dtype),
+            dequantize_per_token(ref_y, scale, dtype),
+        )
+
+
+if __name__ == "__main__":
+    # Run the specific test function directly
+    pytest.main([__file__])
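The per-token path in the new test differs only in that the scale is computed per row, `max(abs(row)) / FP8_E4M3_MAX`, which is what `quantize_ref_per_token` above encodes. A pure-torch sketch of that scheme (assuming torch >= 2.1; the `scaled_fp8_quant` CUDA kernel itself is not called):

```python
# Sketch of per-token (per-row) dynamic FP8 quantization, mirroring
# quantize_ref_per_token above; pure torch, CPU-only, assumes torch >= 2.1.
import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max

x = torch.randn(11, 16, dtype=torch.float32) * 13
scale = x.abs().amax(dim=1, keepdim=True) / FP8_MAX  # one scale per row
q = (x / scale).clamp(min=-FP8_MAX, max=FP8_MAX).to(torch.float8_e4m3fn)
dq = q.to(torch.float32) * scale  # dequantize row-wise
print("per-token max abs error:", (x - dq).abs().max().item())
```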
sglang/test/test_utils.py
CHANGED
@@ -28,6 +28,10 @@ from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback
 
 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
+DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
+DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
+)
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
@@ -36,12 +40,15 @@ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instru
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
+    "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
+)
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
 
@@ -446,22 +453,31 @@ def run_with_timeout(
     return ret_value[0]
 
 
-def run_unittest_files(files: List
+def run_unittest_files(files: List, timeout_per_file: float):
     tic = time.time()
     success = True
 
-    for filename in files:
+    for file in files:
+        filename, estimated_time = file.name, file.estimated_time
         process = None
 
         def run_one_file(filename):
             nonlocal process
 
             filename = os.path.join(os.getcwd(), filename)
-            print(f"
+            print(f".\n.\nBegin:\npython3 {filename}\n.\n.\n", flush=True)
+            tic = time.time()
+
             process = subprocess.Popen(
                 ["python3", filename], stdout=None, stderr=None, env=os.environ
             )
             process.wait()
+            elapsed = time.time() - tic
+
+            print(
+                f".\n.\nEnd:\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
+                flush=True,
+            )
             return process.returncode
 
         try:
sglang/utils.py
CHANGED
@@ -24,14 +24,10 @@ import requests
 from IPython.display import HTML, display
 from tqdm import tqdm
 
-from sglang.srt.openai_api.protocol import ChatCompletionMessageContentPart
 from sglang.srt.utils import kill_process_tree
 
 logger = logging.getLogger(__name__)
 
-# type of content fields, can be only prompts or with images/videos
-MsgContent = Union[str, List[ChatCompletionMessageContentPart]]
-
 
 def get_exception_traceback():
     etype, value, tb = sys.exc_info()
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.3.post4"
+__version__ = "0.4.4.post1"
{sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.3.post4
+Version: 0.4.4.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -211,19 +211,22 @@ Classifier: License :: OSI Approved :: Apache Software License
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: aiohttp
 Requires-Dist: requests
 Requires-Dist: tqdm
 Requires-Dist: numpy
 Requires-Dist: IPython
 Requires-Dist: setproctitle
 Provides-Extra: runtime-common
-Requires-Dist:
+Requires-Dist: datasets; extra == "runtime-common"
 Requires-Dist: decord; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
+Requires-Dist: llguidance>=0.6.15; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
+Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
@@ -233,24 +236,20 @@ Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
+Requires-Dist: transformers==4.48.3; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.
-Requires-Dist: ninja; extra == "runtime-common"
-Requires-Dist: transformers==4.48.3; extra == "runtime-common"
-Requires-Dist: llguidance>=0.6.15; extra == "runtime-common"
-Requires-Dist: datasets; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.15; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.
-Requires-Dist: flashinfer_python==0.2.
+Requires-Dist: sgl-kernel==0.0.5; extra == "srt"
+Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
 Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
-Requires-Dist: sgl-kernel==0.0.3.post6; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: vllm==0.6.7.dev2; extra == "srt-hip"
 Requires-Dist: outlines==0.1.11; extra == "srt-hip"