sglang 0.4.0.post2.tar.gz → 0.4.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. {sglang-0.4.0.post2 → sglang-0.4.1}/PKG-INFO +3 -3
  2. {sglang-0.4.0.post2 → sglang-0.4.1}/README.md +1 -1
  3. {sglang-0.4.0.post2 → sglang-0.4.1}/pyproject.toml +3 -3
  4. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_offline_throughput.py +0 -12
  5. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_one_batch.py +0 -12
  6. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_serving.py +1 -0
  7. sglang-0.4.1/sglang/srt/aio_rwlock.py +100 -0
  8. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/configs/model_config.py +8 -1
  9. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/flashinfer_backend.py +49 -5
  10. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/linear.py +20 -2
  11. {sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/ep_moe/layer.py +14 -39
  12. sglang-0.4.1/sglang/srt/layers/moe/fused_moe_native.py +46 -0
  13. {sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/fused_moe_triton/__init__.py +3 -7
  14. {sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/fused_moe_triton/fused_moe.py +110 -98
  15. {sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/fused_moe_triton/layer.py +16 -48
  16. sglang-0.4.1/sglang/srt/layers/moe/topk.py +191 -0
  17. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/quantization/__init__.py +3 -3
  18. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/quantization/fp8.py +169 -32
  19. sglang-0.4.1/sglang/srt/layers/quantization/fp8_kernel.py +278 -0
  20. sglang-0.4.1/sglang/srt/layers/quantization/fp8_utils.py +116 -0
  21. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/torchao_utils.py +11 -15
  22. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/schedule_batch.py +16 -10
  23. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/scheduler.py +2 -2
  24. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/tokenizer_manager.py +86 -76
  25. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/mem_cache/memory_pool.py +15 -8
  26. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_executor/cuda_graph_runner.py +1 -1
  27. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_executor/model_runner.py +6 -0
  28. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/dbrx.py +1 -1
  29. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/deepseek.py +1 -1
  30. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/deepseek_v2.py +67 -18
  31. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/grok.py +1 -1
  32. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/mixtral.py +2 -2
  33. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/olmoe.py +1 -1
  34. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/qwen2_moe.py +1 -1
  35. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/xverse_moe.py +1 -1
  36. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/openai_api/adapter.py +4 -0
  37. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/server.py +1 -0
  38. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/utils.py +33 -44
  39. sglang-0.4.1/sglang/test/test_block_fp8.py +341 -0
  40. sglang-0.4.1/sglang/version.py +1 -0
  41. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang.egg-info/PKG-INFO +3 -3
  42. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang.egg-info/SOURCES.txt +11 -7
  43. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang.egg-info/requires.txt +1 -1
  44. sglang-0.4.0.post2/sglang/srt/layers/fused_moe_patch.py +0 -133
  45. sglang-0.4.0.post2/sglang/srt/layers/quantization/fp8_utils.py +0 -27
  46. sglang-0.4.0.post2/sglang/version.py +0 -1
  47. {sglang-0.4.0.post2 → sglang-0.4.1}/LICENSE +0 -0
  48. {sglang-0.4.0.post2 → sglang-0.4.1}/setup.cfg +0 -0
  49. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/__init__.py +0 -0
  50. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/api.py +0 -0
  51. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_latency.py +0 -0
  52. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_one_batch_server.py +0 -0
  53. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/check_env.py +0 -0
  54. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/global_config.py +0 -0
  55. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/__init__.py +0 -0
  56. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/backend/__init__.py +0 -0
  57. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/backend/anthropic.py +0 -0
  58. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/backend/base_backend.py +0 -0
  59. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/backend/litellm.py +0 -0
  60. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/backend/openai.py +0 -0
  61. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  62. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/backend/vertexai.py +0 -0
  63. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/chat_template.py +0 -0
  64. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/choices.py +0 -0
  65. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/compiler.py +0 -0
  66. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/interpreter.py +0 -0
  67. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/ir.py +0 -0
  68. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/lang/tracer.py +0 -0
  69. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/launch_server.py +0 -0
  70. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/launch_server_llavavid.py +0 -0
  71. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/llama3_eval.py +0 -0
  72. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/_custom_ops.py +0 -0
  73. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/configs/__init__.py +0 -0
  74. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/configs/device_config.py +0 -0
  75. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/configs/exaone.py +0 -0
  76. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/configs/load_config.py +0 -0
  77. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/configs/qwen2vl.py +0 -0
  78. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/constrained/__init__.py +0 -0
  79. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
  80. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/constrained/outlines_backend.py +0 -0
  81. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
  82. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/constrained/xgrammar_backend.py +0 -0
  83. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/conversation.py +0 -0
  84. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/__init__.py +0 -0
  85. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/communication_op.py +0 -0
  86. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/__init__.py +0 -0
  87. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
  88. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
  89. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
  90. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
  91. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
  92. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
  93. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
  94. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
  95. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/parallel_state.py +0 -0
  96. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/distributed/utils.py +0 -0
  97. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/hf_transformers_utils.py +0 -0
  98. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/activation.py +0 -0
  99. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/__init__.py +0 -0
  100. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
  101. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
  102. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/triton_backend.py +0 -0
  103. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
  104. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
  105. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
  106. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
  107. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/custom_op_util.py +0 -0
  108. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/layernorm.py +0 -0
  109. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/logits_processor.py +0 -0
  110. {sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/ep_moe/__init__.py +0 -0
  111. {sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/ep_moe/kernels.py +0 -0
  112. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/pooler.py +0 -0
  113. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/quantization/base_config.py +0 -0
  114. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/radix_attention.py +0 -0
  115. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/rotary_embedding.py +0 -0
  116. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/sampler.py +0 -0
  117. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
  118. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/lora/lora.py +0 -0
  119. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/lora/lora_config.py +0 -0
  120. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/lora/lora_manager.py +0 -0
  121. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/data_parallel_controller.py +0 -0
  122. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  123. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/image_processor.py +0 -0
  124. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/io_struct.py +0 -0
  125. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/schedule_policy.py +0 -0
  126. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/session_controller.py +0 -0
  127. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/tp_worker.py +0 -0
  128. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
  129. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  130. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  131. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/mem_cache/flush_cache.py +0 -0
  132. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/mem_cache/radix_cache.py +0 -0
  133. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/metrics/collector.py +0 -0
  134. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/metrics/func_timer.py +0 -0
  135. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/mm_utils.py +0 -0
  136. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_executor/forward_batch_info.py +0 -0
  137. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_loader/__init__.py +0 -0
  138. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_loader/loader.py +0 -0
  139. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_loader/utils.py +0 -0
  140. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_loader/weight_utils.py +0 -0
  141. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/model_parallel.py +0 -0
  142. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/baichuan.py +0 -0
  143. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/chatglm.py +0 -0
  144. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/commandr.py +0 -0
  145. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/exaone.py +0 -0
  146. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/gemma.py +0 -0
  147. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/gemma2.py +0 -0
  148. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/gemma2_reward.py +0 -0
  149. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/gpt2.py +0 -0
  150. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/gpt_bigcode.py +0 -0
  151. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/granite.py +0 -0
  152. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/internlm2.py +0 -0
  153. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/internlm2_reward.py +0 -0
  154. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/llama.py +0 -0
  155. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/llama_classification.py +0 -0
  156. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/llama_embedding.py +0 -0
  157. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/llama_reward.py +0 -0
  158. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/llava.py +0 -0
  159. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/llavavid.py +0 -0
  160. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/minicpm.py +0 -0
  161. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/minicpm3.py +0 -0
  162. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/mistral.py +0 -0
  163. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/mixtral_quant.py +0 -0
  164. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/mllama.py +0 -0
  165. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/olmo.py +0 -0
  166. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/olmo2.py +0 -0
  167. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/phi3_small.py +0 -0
  168. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/qwen.py +0 -0
  169. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/qwen2.py +0 -0
  170. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/qwen2_vl.py +0 -0
  171. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/registry.py +0 -0
  172. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/stablelm.py +0 -0
  173. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/torch_native_llama.py +0 -0
  174. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/xverse.py +0 -0
  175. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/models/yivl.py +0 -0
  176. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/openai_api/protocol.py +0 -0
  177. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  178. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  179. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  180. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  181. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  182. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  183. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
  184. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/sampling/sampling_params.py +0 -0
  185. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/server_args.py +0 -0
  186. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/few_shot_gsm8k.py +0 -0
  187. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
  188. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/run_eval.py +0 -0
  189. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/runners.py +0 -0
  190. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/simple_eval_common.py +0 -0
  191. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/simple_eval_gpqa.py +0 -0
  192. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/simple_eval_humaneval.py +0 -0
  193. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/simple_eval_math.py +0 -0
  194. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/simple_eval_mgsm.py +0 -0
  195. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/simple_eval_mmlu.py +0 -0
  196. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  197. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/test_activation.py +0 -0
  198. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/test_layernorm.py +0 -0
  199. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/test_programs.py +0 -0
  200. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/test/test_utils.py +0 -0
  201. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang/utils.py +0 -0
  202. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang.egg-info/dependency_links.txt +0 -0
  203. {sglang-0.4.0.post2 → sglang-0.4.1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.4.0.post2 → sglang-0.4.1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.4.0.post2
+ Version: 0.4.1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -234,7 +234,6 @@ Requires-Dist: pydantic; extra == "runtime-common"
  Requires-Dist: python-multipart; extra == "runtime-common"
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
  Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
- Requires-Dist: gemlite; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
  Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
@@ -244,6 +243,7 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
  Requires-Dist: flashinfer==0.1.6; extra == "srt"
+ Requires-Dist: sgl-kernel>=0.0.2.post8; extra == "srt"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
@@ -358,7 +358,7 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

  ## Adoption and Sponsorship
- The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+ The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI and DataCrunch.

  ## Acknowledgment and Citation
  We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.4.0.post2 → sglang-0.4.1}/README.md
@@ -57,7 +57,7 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

  ## Adoption and Sponsorship
- The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+ The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI and DataCrunch.

  ## Acknowledgment and Citation
  We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.4.0.post2 → sglang-0.4.1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.4.0.post2"
+ version = "0.4.1"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -21,9 +21,9 @@ runtime_common = ["aiohttp", "decord", "fastapi",
  "orjson", "outlines>=0.0.44,<0.1.0",
  "packaging", "pillow", "prometheus-client>=0.20.0",
  "psutil", "pydantic", "python-multipart",
- "pyzmq>=25.1.2", "torchao>=0.7.0", "gemlite", "uvicorn", "uvloop",
+ "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
  "xgrammar>=0.1.6"]
- srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6"]
+ srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.post8"]

  # HIP (Heterogeneous-computing Interface for Portability) for AMD
  # => base docker rocm/vllm-dev:20241022, not from public vllm whl
{sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_offline_throughput.py
@@ -322,18 +322,6 @@ def throughput_test(
  )
  time.sleep(0.5)

- try:
-     import os
-     import pwd
-
-     from gemlite.core import GemLiteLinearTriton
-
-     GemLiteLinearTriton.cache_config(
-         f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
-     )
- except ImportError:
-     pass
-
  logging.info("\nBenchmark...")
  result = throughput_test_once(
      backend_name=bench_args.backend,
{sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_one_batch.py
@@ -386,18 +386,6 @@ def latency_test(
      server_args.device,
  )

- try:
-     import os
-     import pwd
-
-     from gemlite.core import GemLiteLinearTriton
-
-     GemLiteLinearTriton.cache_config(
-         f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json"
-     )
- except ImportError:
-     pass
-
  rank_print("Benchmark ...")

  # Run the sweep
{sglang-0.4.0.post2 → sglang-0.4.1}/sglang/bench_serving.py
@@ -924,6 +924,7 @@ async def benchmark(
          f"are correctly specified. Error: {test_output.error}"
      )
  else:
+     requests.post(base_url + "/flush_cache")
      print("Initial test run completed. Starting main benchmark run...")

  time.sleep(1.5)
sglang-0.4.1/sglang/srt/aio_rwlock.py
@@ -0,0 +1,100 @@
+ import asyncio
+
+
+ class RWLock:
+     def __init__(self):
+         # Protects internal state
+         self._lock = asyncio.Lock()
+
+         # Condition variable used to wait for state changes
+         self._cond = asyncio.Condition(self._lock)
+
+         # Number of readers currently holding the lock
+         self._readers = 0
+
+         # Whether a writer is currently holding the lock
+         self._writer_active = False
+
+         # How many writers are queued waiting for a turn
+         self._waiting_writers = 0
+
+     @property
+     def reader_lock(self):
+         """
+         A context manager for acquiring a shared (reader) lock.
+
+         Example:
+             async with rwlock.reader_lock:
+                 # read-only access
+         """
+         return _ReaderLock(self)
+
+     @property
+     def writer_lock(self):
+         """
+         A context manager for acquiring an exclusive (writer) lock.
+
+         Example:
+             async with rwlock.writer_lock:
+                 # exclusive access
+         """
+         return _WriterLock(self)
+
+     async def acquire_reader(self):
+         async with self._lock:
+             # Wait until there is no active writer or waiting writer
+             # to ensure fairness.
+             while self._writer_active or self._waiting_writers > 0:
+                 await self._cond.wait()
+             self._readers += 1
+
+     async def release_reader(self):
+         async with self._lock:
+             self._readers -= 1
+             # If this was the last reader, wake up anyone waiting
+             # (potentially a writer or new readers).
+             if self._readers == 0:
+                 self._cond.notify_all()
+
+     async def acquire_writer(self):
+         async with self._lock:
+             # Increment the count of writers waiting
+             self._waiting_writers += 1
+             try:
+                 # Wait while either a writer is active or readers are present
+                 while self._writer_active or self._readers > 0:
+                     await self._cond.wait()
+                 self._writer_active = True
+             finally:
+                 # Decrement waiting writers only after we've acquired the writer lock
+                 self._waiting_writers -= 1
+
+     async def release_writer(self):
+         async with self._lock:
+             self._writer_active = False
+             # Wake up anyone waiting (readers or writers)
+             self._cond.notify_all()
+
+
+ class _ReaderLock:
+     def __init__(self, rwlock: RWLock):
+         self._rwlock = rwlock
+
+     async def __aenter__(self):
+         await self._rwlock.acquire_reader()
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         await self._rwlock.release_reader()
+
+
+ class _WriterLock:
+     def __init__(self, rwlock: RWLock):
+         self._rwlock = rwlock
+
+     async def __aenter__(self):
+         await self._rwlock.acquire_writer()
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         await self._rwlock.release_writer()
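The lock is writer-preferring: acquire_reader blocks not only while a writer holds the lock but also while any writer is queued, so a steady stream of readers cannot starve a waiting writer. A minimal usage sketch (a standalone demo, not code from the package; it assumes only the RWLock class above and its import path from this release):

import asyncio

from sglang.srt.aio_rwlock import RWLock


async def main():
    rwlock = RWLock()
    state = {"value": 0}

    async def reader(i: int):
        async with rwlock.reader_lock:  # shared: readers may overlap
            print(f"reader {i} sees {state['value']}")

    async def writer():
        async with rwlock.writer_lock:  # exclusive: waits out all readers
            state["value"] += 1

    await asyncio.gather(*(reader(i) for i in range(3)), writer())


asyncio.run(main())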
{sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/configs/model_config.py
@@ -94,7 +94,10 @@ class ModelConfig:
  )

  # FIXME: temporary special judge for MLA architecture
- if "DeepseekV2ForCausalLM" in self.hf_config.architectures:
+ if (
+     "DeepseekV2ForCausalLM" in self.hf_config.architectures
+     or "DeepseekV3ForCausalLM" in self.hf_config.architectures
+ ):
      self.head_dim = 256
      self.attention_arch = AttentionArch.MLA
      self.kv_lora_rank = self.hf_config.kv_lora_rank
@@ -124,8 +127,12 @@ class ModelConfig:
  self.num_hidden_layers = self.hf_text_config.num_hidden_layers
  self.vocab_size = self.hf_text_config.vocab_size

+ # Verify quantization
  self._verify_quantization()

+ # Multimodal attrs
+ self.image_token_id = getattr(self.hf_config, "image_token_id", None)
+
  # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
  def get_total_num_kv_heads(self) -> int:
      """Returns the total number of KV heads."""
{sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/attention/flashinfer_backend.py
@@ -18,11 +18,7 @@ import triton.language as tl
  from sglang.global_config import global_config
  from sglang.srt.layers.attention import AttentionBackend
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
- from sglang.srt.utils import (
-     get_bool_env_var,
-     is_flashinfer_available,
-     should_use_tensor_core,
- )
+ from sglang.srt.utils import is_flashinfer_available

  if TYPE_CHECKING:
      from sglang.srt.layers.radix_attention import RadixAttention
@@ -731,3 +727,51 @@ def create_flashinfer_kv_indices_triton(
          mask=mask,
      )
      tl.store(kv_indices_ptr + kv_indices_offset + offset, data, mask=mask)
+
+
+ def should_use_tensor_core(
+     kv_cache_dtype: torch.dtype,
+     num_attention_heads: int,
+     num_kv_heads: int,
+ ) -> bool:
+     """
+     Determine whether to use tensor cores for attention computation.
+
+     Args:
+         kv_cache_dtype: Data type of the KV cache
+         num_attention_heads: Number of attention heads
+         num_kv_heads: Number of key/value heads
+
+     Returns:
+         bool: Whether to use tensor cores
+     """
+     # Try to use environment variable first
+     env_override = os.environ.get("SGLANG_FLASHINFER_USE_TENSOR_CORE")
+     if env_override is not None:
+         return env_override.lower() == "true"
+
+     # Try to use _grouped_size_compiled_for_decode_kernels if available
+     # This is for flashinfer <=0.1.6. Otherwise, there is an accuracy bug
+     try:
+         from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
+
+         if not _grouped_size_compiled_for_decode_kernels(
+             num_attention_heads,
+             num_kv_heads,
+         ):
+             return True
+         else:
+             return False
+     except (ImportError, AttributeError):
+         pass
+
+     # Calculate GQA group size
+     gqa_group_size = num_attention_heads // num_kv_heads
+
+     # Determine based on dtype and GQA group size
+     if kv_cache_dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+         return True
+     elif kv_cache_dtype in (torch.float16, torch.half, torch.bfloat16):
+         return gqa_group_size > 4
+     else:
+         return False
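The fallback heuristic at the bottom is easy to check by hand: a model with 32 attention heads and 8 KV heads has a GQA group size of 32 // 8 = 4, so an fp16 KV cache returns False (the group size must exceed 4), while an fp8 KV cache always returns True. A hypothetical sanity check (reachable only when SGLANG_FLASHINFER_USE_TENSOR_CORE is unset and the flashinfer probe above is unavailable):

import torch

from sglang.srt.layers.attention.flashinfer_backend import should_use_tensor_core

should_use_tensor_core(torch.float16, num_attention_heads=32, num_kv_heads=8)        # False: group size 4
should_use_tensor_core(torch.float8_e4m3fn, num_attention_heads=32, num_kv_heads=8)  # True: fp8 KV cache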
{sglang-0.4.0.post2 → sglang-0.4.1}/sglang/srt/layers/linear.py
@@ -30,6 +30,7 @@ from sglang.srt.layers.quantization.base_config import (
      QuantizationConfig,
      QuantizeMethodBase,
  )
+ from sglang.srt.layers.quantization.fp8_utils import BlockQuantScaleParameter
  from sglang.srt.utils import set_weight_attrs

  logger = logging.getLogger(__name__)
@@ -628,8 +629,19 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
  assert loaded_shard_id < len(self.output_sizes)

  tp_size = get_tensor_model_parallel_world_size()
- shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
- shard_size = self.output_sizes[loaded_shard_id] // tp_size
+
+ if isinstance(param, BlockQuantScaleParameter):
+     weight_block_size = self.quant_method.quant_config.weight_block_size
+     block_n, _ = weight_block_size[0], weight_block_size[1]
+     shard_offset = (
+         (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // block_n
+     ) // tp_size
+     shard_size = (
+         (self.output_sizes[loaded_shard_id] + block_n - 1) // block_n // tp_size
+     )
+ else:
+     shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
+     shard_size = self.output_sizes[loaded_shard_id] // tp_size

  param.load_merged_column_weight(
      loaded_weight=loaded_weight,
@@ -795,6 +807,12 @@ class QKVParallelLinear(ColumnParallelLinear):
  shard_offset = self._get_shard_offset_mapping(loaded_shard_id)
  shard_size = self._get_shard_size_mapping(loaded_shard_id)

+ if isinstance(param, BlockQuantScaleParameter):
+     weight_block_size = self.quant_method.quant_config.weight_block_size
+     block_n, _ = weight_block_size[0], weight_block_size[1]
+     shard_offset = (shard_offset + block_n - 1) // block_n
+     shard_size = (shard_size + block_n - 1) // block_n
+
  param.load_qkv_weight(
      loaded_weight=loaded_weight,
      num_heads=self.num_kv_head_replicas,
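The ceiling division reflects how block-wise FP8 stores its scales: with a [block_n, block_k] tile size, a weight shard of R output rows owns ceil(R / block_n) rows of scale values, so shard offsets and sizes must be converted from weight rows to scale rows before loading. A worked example with made-up numbers (two merged projections of 2048 rows each, 128x128 blocks, no tensor parallelism):

output_sizes, block_n, tp_size = [2048, 2048], 128, 1
shard_offset = ((sum(output_sizes[:1]) + block_n - 1) // block_n) // tp_size  # 16 scale rows
shard_size = (output_sizes[1] + block_n - 1) // block_n // tp_size            # 16 scale rows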
{sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/ep_moe/layer.py
@@ -12,15 +12,15 @@ from vllm.model_executor.custom_op import CustomOp
  from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod

  from sglang.srt.layers.custom_op_util import register_custom_op
- from sglang.srt.layers.ep_moe.kernels import (
+ from sglang.srt.layers.moe.ep_moe.kernels import (
      grouped_gemm_triton,
      post_reorder_triton_kernel,
      pre_reorder_triton_kernel,
      run_moe_ep_preproess,
      silu_and_mul_triton_kernel,
  )
- from sglang.srt.layers.fused_moe_triton.fused_moe import fused_topk, grouped_topk
- from sglang.srt.layers.fused_moe_triton.layer import FusedMoEMethodBase
+ from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoEMethodBase
+ from sglang.srt.layers.moe.topk import select_experts
  from sglang.srt.layers.quantization.base_config import (
      QuantizationConfig,
      QuantizeMethodBase,
@@ -113,6 +113,7 @@ class EPMoE(torch.nn.Module):
      quant_config: Optional[QuantizationConfig] = None,
      tp_size: Optional[int] = None,
      prefix: str = "",
+     correction_bias: Optional[torch.Tensor] = None,
  ):
      super().__init__()

@@ -138,6 +139,7 @@
  assert num_expert_group is not None and topk_group is not None
  self.num_expert_group = num_expert_group
  self.topk_group = topk_group
+ self.correction_bias = correction_bias

  if quant_config is None:
      self.quant_method: Optional[QuantizeMethodBase] = UnquantizedEPMoEMethod()
@@ -170,13 +172,15 @@
      hidden_states.device, use_flashinfer=False  # TODO: use flashinfer
  )

- topk_weights, topk_ids = self.select_experts(
-     hidden_states,
-     router_logits,
-     self.top_k,
-     self.renormalize,
-     self.topk_group,
-     self.num_expert_group,
+ topk_weights, topk_ids = select_experts(
+     hidden_states=hidden_states,
+     router_logits=router_logits,
+     top_k=self.top_k,
+     use_grouped_topk=self.use_grouped_topk,
+     renormalize=self.renormalize,
+     topk_group=self.topk_group,
+     num_expert_group=self.num_expert_group,
+     correction_bias=self.correction_bias,
  )

  reorder_topk_ids, src2dst, seg_indptr = run_moe_ep_preproess(
@@ -297,35 +301,6 @@
      )
      return output

- def select_experts(
-     self,
-     hidden_states: torch.Tensor,
-     router_logits: torch.Tensor,
-     top_k: int,
-     renormalize: bool,
-     topk_group: Optional[int] = None,
-     num_expert_group: Optional[int] = None,
- ):
-     if self.use_grouped_topk:
-         assert topk_group is not None
-         assert num_expert_group is not None
-         topk_weights, topk_ids = grouped_topk(
-             hidden_states=hidden_states,
-             gating_output=router_logits,
-             topk=top_k,
-             renormalize=renormalize,
-             num_expert_group=num_expert_group,
-             topk_group=topk_group,
-         )
-     else:
-         topk_weights, topk_ids = fused_topk(
-             hidden_states=hidden_states,
-             gating_output=router_logits,
-             topk=top_k,
-             renormalize=renormalize,
-         )
-     return topk_weights, topk_ids.to(torch.int32)
-
  @classmethod
  def make_expert_params_mapping(
      cls,
sglang-0.4.1/sglang/srt/layers/moe/fused_moe_native.py
@@ -0,0 +1,46 @@
+ """
+ Torch-native implementation for FusedMoE. This is used for torch.compile.
+ It is based on https://github.com/pytorch-labs/gpt-fast/blob/32971d3129541c5bfb4f715abc33d1c5f408d204/mixtral-moe/model.py#L204
+ """
+
+ from typing import Callable, Optional
+
+ import torch
+ from torch.nn import functional as F
+
+ from sglang.srt.layers.moe.topk import select_experts
+
+
+ def fused_moe_forward_native(
+     layer: torch.nn.Module,
+     x: torch.Tensor,
+     use_grouped_topk: bool,
+     top_k: int,
+     router_logits: torch.Tensor,
+     renormalize: bool,
+     topk_group: Optional[int] = None,
+     num_expert_group: Optional[int] = None,
+     custom_routing_function: Optional[Callable] = None,
+     correction_bias: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+     topk_weights, topk_ids = select_experts(
+         hidden_states=x,
+         router_logits=router_logits,
+         use_grouped_topk=use_grouped_topk,
+         top_k=top_k,
+         renormalize=renormalize,
+         topk_group=topk_group,
+         num_expert_group=num_expert_group,
+         custom_routing_function=custom_routing_function,
+         correction_bias=correction_bias,
+         torch_native=True,
+     )
+
+     w13_weights = layer.w13_weight[topk_ids]
+     w1_weights, w3_weights = torch.chunk(w13_weights, 2, dim=2)
+     w2_weights = layer.w2_weight[topk_ids]
+     x1 = torch.einsum("ti,taoi -> tao", x, w1_weights)
+     x1 = F.silu(x1)
+     x3 = torch.einsum("ti, taoi -> tao", x, w3_weights)
+     expert_outs = torch.einsum("tao, taio -> tai", (x1 * x3), w2_weights)
+     return torch.einsum("tai,ta -> ti", expert_outs, topk_weights.to(expert_outs.dtype))
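For readers decoding the einsum subscripts: t indexes tokens, a the top-k experts selected per token, i the hidden size, and o the intermediate size. A shape-only sketch with made-up dimensions (an illustration, not package code) showing the same contractions:

import torch
import torch.nn.functional as F

t, a, i, o = 4, 2, 8, 16                  # tokens, top_k, hidden, intermediate
x = torch.randn(t, i)
w1 = torch.randn(t, a, o, i)              # gathered gate projections per token/expert
w3 = torch.randn(t, a, o, i)              # gathered up projections
w2 = torch.randn(t, a, i, o)              # gathered down projections
topk_weights = torch.softmax(torch.randn(t, a), dim=-1)

x1 = F.silu(torch.einsum("ti,taoi -> tao", x, w1))           # (t, a, o)
x3 = torch.einsum("ti,taoi -> tao", x, w3)                   # (t, a, o)
expert_outs = torch.einsum("tao,taio -> tai", x1 * x3, w2)   # (t, a, i)
out = torch.einsum("tai,ta -> ti", expert_outs, topk_weights)  # weighted sum -> (t, i)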
{sglang-0.4.0.post2/sglang/srt/layers → sglang-0.4.1/sglang/srt/layers/moe}/fused_moe_triton/__init__.py
@@ -1,14 +1,12 @@
  from contextlib import contextmanager
  from typing import Any, Dict, Optional

- import sglang.srt.layers.fused_moe_triton.fused_moe  # noqa
- from sglang.srt.layers.fused_moe_triton.fused_moe import (
+ import sglang.srt.layers.moe.fused_moe_triton.fused_moe  # noqa
+ from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
      fused_experts,
-     fused_topk,
      get_config_file_name,
-     grouped_topk,
  )
- from sglang.srt.layers.fused_moe_triton.layer import (
+ from sglang.srt.layers.moe.fused_moe_triton.layer import (
      FusedMoE,
      FusedMoEMethodBase,
      FusedMoeWeightScaleSupported,
@@ -37,8 +35,6 @@ __all__ = [
      "override_config",
      "get_config",
      "fused_moe",
-     "fused_topk",
      "fused_experts",
      "get_config_file_name",
-     "grouped_topk",
  ]
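Downstream code that imported the routing helpers from the old location needs updating: fused_topk and grouped_topk are no longer re-exported here, and this release routes expert selection through the unified helper in the new topk module instead. A sketch of the import change (paths taken from this diff):

# Before (0.4.0.post2):
# from sglang.srt.layers.fused_moe_triton import fused_topk, grouped_topk

# After (0.4.1): routing is unified behind select_experts
from sglang.srt.layers.moe.topk import select_experts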