sglang 0.4.9__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
- sglang/bench_serving.py +2 -2
- sglang/srt/configs/model_config.py +12 -1
- sglang/srt/conversation.py +35 -1
- sglang/srt/disaggregation/mooncake/conn.py +35 -4
- sglang/srt/entrypoints/http_server_engine.py +1 -1
- sglang/srt/layers/communicator.py +3 -1
- sglang/srt/layers/flashinfer_comm_fusion.py +3 -3
- sglang/srt/layers/layernorm.py +2 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +58 -0
- sglang/srt/layers/moe/ep_moe/layer.py +140 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +135 -58
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/fp8.py +28 -7
- sglang/srt/layers/quantization/modelopt_quant.py +244 -1
- sglang/srt/layers/quantization/w4afp8.py +264 -0
- sglang/srt/layers/vocab_parallel_embedding.py +9 -3
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
- sglang/srt/managers/cache_controller.py +41 -195
- sglang/srt/managers/io_struct.py +8 -1
- sglang/srt/managers/mm_utils.py +4 -2
- sglang/srt/managers/schedule_batch.py +1 -1
- sglang/srt/managers/scheduler.py +17 -5
- sglang/srt/mem_cache/hiradix_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +113 -63
- sglang/srt/mem_cache/memory_pool_host.py +6 -109
- sglang/srt/mem_cache/radix_cache.py +8 -4
- sglang/srt/models/deepseek_v2.py +16 -2
- sglang/srt/models/mllama4.py +360 -79
- sglang/srt/multimodal/mm_utils.py +2 -2
- sglang/srt/multimodal/processors/mllama4.py +62 -60
- sglang/srt/server_args.py +15 -0
- sglang/srt/two_batch_overlap.py +3 -0
- sglang/srt/utils.py +37 -17
- sglang/test/test_cutlass_w4a8_moe.py +281 -0
- sglang/utils.py +5 -5
- sglang/version.py +1 -1
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +4 -3
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +47 -43
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -217,11 +217,13 @@ class ServerArgs:
     hicache_ratio: float = 2.0
     hicache_size: int = 0
     hicache_write_policy: str = "write_through_selective"
+    hicache_io_backend: str = ""
     flashinfer_mla_disable_ragged: bool = False
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
+    enable_triton_kernel_moe: bool = False
     warmups: Optional[str] = None

     # Debug tensor dumps
@@ -706,6 +708,7 @@ class ServerArgs:
                 "w8a8_fp8",
                 "moe_wna16",
                 "qoq",
+                "w4afp8",
             ],
             help="The quantization method.",
         )
@@ -1529,6 +1532,13 @@ class ServerArgs:
             default=ServerArgs.hicache_write_policy,
             help="The write policy of hierarchical cache.",
         )
+        parser.add_argument(
+            "--hicache-io-backend",
+            type=str,
+            choices=["direct", "kernel"],
+            default=ServerArgs.hicache_io_backend,
+            help="The IO backend for KV cache transfer between CPU and GPU",
+        )
         parser.add_argument(
             "--flashinfer-mla-disable-ragged",
             action="store_true",
@@ -1554,6 +1564,11 @@ class ServerArgs:
             action="store_true",
             help="Enable returning hidden states with responses.",
         )
+        parser.add_argument(
+            "--enable-triton-kernel-moe",
+            action="store_true",
+            help="Use triton moe grouped gemm kernel.",
+        )
         parser.add_argument(
             "--warmups",
             type=str,
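For reference, this release adds three user-facing knobs: a hierarchical-cache IO backend (--hicache-io-backend), a Triton grouped-GEMM MoE path (--enable-triton-kernel-moe), and a new "w4afp8" entry in the --quantization choices. Below is a minimal sketch of exercising them through the ServerArgs dataclass; the model path is a placeholder, and the field values simply mirror the diff above.

# Sketch only: constructs ServerArgs with the fields added in 0.4.9.post1.
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="path/to/model",        # placeholder
    hicache_io_backend="kernel",       # new: "direct" or "kernel" CPU<->GPU KV IO
    enable_triton_kernel_moe=True,     # new: Triton MoE grouped-GEMM kernels
    quantization="w4afp8",             # new quantization choice
)

# Roughly equivalent CLI invocation:
#   python -m sglang.launch_server --model-path path/to/model \
#       --hicache-io-backend kernel --enable-triton-kernel-moe --quantization w4afp8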
sglang/srt/two_batch_overlap.py
CHANGED
@@ -490,6 +490,7 @@ class TboForwardBatchPreparer:
         output_dict["spec_info"] = output_spec_info
         for key in [
             "forward_mode",
+            "is_extend_in_batch",
             "return_logprob",
             "req_to_token_pool",
             "token_to_kv_pool",
@@ -550,6 +551,8 @@ class TboForwardBatchPreparer:
                 top_p_normalized_logprobs=False,
                 top_p=None,
                 mm_inputs=None,
+                top_logprobs_nums=None,
+                token_ids_logprobs=None,
            )
        )

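The first hunk matters because the preparer copies only a fixed whitelist of attributes from the parent forward batch into each overlapped micro-batch, so any field left off the list is silently missing on the children; 0.4.9.post1 adds is_extend_in_batch to that list. An illustrative sketch of the pattern (simplified, not the actual sglang class):

from types import SimpleNamespace

def copy_whitelisted(parent, output_dict: dict) -> None:
    # Copy only whitelisted attributes into the micro-batch dict;
    # "is_extend_in_batch" is the key this release adds to the list.
    for key in [
        "forward_mode",
        "is_extend_in_batch",
        "return_logprob",
        "req_to_token_pool",
        "token_to_kv_pool",
    ]:
        output_dict[key] = getattr(parent, key)

parent = SimpleNamespace(
    forward_mode="extend",
    is_extend_in_batch=True,
    return_logprob=False,
    req_to_token_pool=None,
    token_to_kv_pool=None,
)
child: dict = {}
copy_whitelisted(parent, child)
assert child["is_extend_in_batch"] is True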
sglang/srt/utils.py
CHANGED
@@ -15,7 +15,6 @@

 from __future__ import annotations

-import base64
 import builtins
 import ctypes
 import dataclasses
@@ -68,6 +67,7 @@ from typing import (

 import numpy as np
 import psutil
+import pybase64
 import requests
 import torch
 import torch.distributed
@@ -83,12 +83,7 @@ from torch.func import functional_call
 from torch.library import Library
 from torch.profiler import ProfilerActivity, profile, record_function
 from torch.utils._contextlib import _DecoratorContextManager
-from triton.runtime.cache import (
-    FileCacheManager,
-    default_cache_dir,
-    default_dump_dir,
-    default_override_dir,
-)
+from triton.runtime.cache import FileCacheManager

 logger = logging.getLogger(__name__)

@@ -621,7 +616,7 @@ def decode_video_base64(video_base64):
     from PIL import Image

     # Decode the base64 string
-    video_bytes = base64.b64decode(video_base64)
+    video_bytes = pybase64.b64decode(video_base64, validate=True)

     # Placeholder for the start indices of each PNG image
     img_starts = []
@@ -707,7 +702,9 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarray:
         audio, original_sr = sf.read(BytesIO(audio_file))
     elif audio_file.startswith("data:"):
         audio_file = audio_file.split(",")[1]
-        audio, original_sr = sf.read(BytesIO(base64.b64decode(audio_file)))
+        audio, original_sr = sf.read(
+            BytesIO(pybase64.b64decode(audio_file, validate=True))
+        )
     elif audio_file.startswith("http://") or audio_file.startswith("https://"):
         timeout = int(os.getenv("REQUEST_TIMEOUT", "5"))
         response = requests.get(audio_file, stream=True, timeout=timeout)
@@ -776,12 +773,12 @@ def load_image(
         image = Image.open(image_file)
     elif image_file.startswith("data:"):
         image_file = image_file.split(",")[1]
-        image = Image.open(BytesIO(base64.b64decode(image_file)))
+        image = Image.open(BytesIO(pybase64.b64decode(image_file, validate=True)))
     elif image_file.startswith("video:"):
         image_file = image_file.replace("video:", "")
         image, image_size = decode_video_base64(image_file)
     elif isinstance(image_file, str):
-        image = Image.open(BytesIO(base64.b64decode(image_file)))
+        image = Image.open(BytesIO(pybase64.b64decode(image_file, validate=True)))
     else:
         raise ValueError(f"Invalid image: {image}")

@@ -923,18 +920,41 @@ class CustomCacheManager(FileCacheManager):

         self.key = key
         self.lock_path = None
+
+        try:
+            module_path = "triton.runtime.cache"
+            cache_module = importlib.import_module(module_path)
+
+            default_cache_dir = getattr(cache_module, "default_cache_dir", None)
+            default_dump_dir = getattr(cache_module, "default_dump_dir", None)
+            default_override_dir = getattr(cache_module, "default_override_dir", None)
+        except (ModuleNotFoundError, AttributeError) as e:
+            default_cache_dir = None
+            default_dump_dir = None
+            default_override_dir = None
+
         if dump:
-            self.cache_dir = default_dump_dir()
+            self.cache_dir = (
+                default_dump_dir()
+                if default_dump_dir is not None
+                else os.path.join(Path.home(), ".triton", "dump")
+            )
             self.cache_dir = os.path.join(self.cache_dir, self.key)
             self.lock_path = os.path.join(self.cache_dir, "lock")
             os.makedirs(self.cache_dir, exist_ok=True)
         elif override:
-            self.cache_dir = default_override_dir()
+            self.cache_dir = (
+                default_override_dir()
+                if default_override_dir is not None
+                else os.path.join(Path.home(), ".triton", "override")
+            )
             self.cache_dir = os.path.join(self.cache_dir, self.key)
         else:
             # create cache directory if it doesn't exist
-            self.cache_dir = (
-                os.getenv("TRITON_CACHE_DIR", "").strip() or default_cache_dir()
+            self.cache_dir = os.getenv("TRITON_CACHE_DIR", "").strip() or (
+                default_cache_dir()
+                if default_cache_dir is not None
+                else os.path.join(Path.home(), ".triton", "cache")
             )
             if self.cache_dir:
                 try:
@@ -1848,7 +1868,7 @@ class MultiprocessingSerializer:

         if output_str:
             # Convert bytes to base64-encoded string
-            output = base64.b64encode(output).decode("utf-8")
+            output = pybase64.b64encode(output).decode("utf-8")

         return output

@@ -1865,7 +1885,7 @@ class MultiprocessingSerializer:
         """
         if isinstance(data, str):
             # Decode base64 string to bytes
-            data = base64.b64decode(data)
+            data = pybase64.b64decode(data, validate=True)

         return ForkingPickler.loads(data)

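Every base64 call site above moves to pybase64, an API-compatible, SIMD-accelerated replacement for the stdlib module; passing validate=True makes decoding reject malformed input rather than silently skipping non-alphabet characters. A minimal sketch of the equivalence:

import base64

import pybase64

payload = b"hello sglang"
encoded = pybase64.b64encode(payload).decode("utf-8")

# Same output as the stdlib, just faster on large inputs.
assert encoded == base64.b64encode(payload).decode("utf-8")
# validate=True raises binascii.Error on malformed input instead of ignoring it.
assert pybase64.b64decode(encoded, validate=True) == payload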
sglang/test/test_cutlass_w4a8_moe.py
ADDED
@@ -0,0 +1,281 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+import pytest
+import torch
+
+from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
+from sglang.srt.layers.moe.topk import select_experts
+
+
+def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Tensor:
+    if int4_values_interleaved.shape[-1] % 2 != 0:
+        raise ValueError(
+            "the last dim size of int4_values_interleaved tensor must be even."
+        )
+
+    input_tensor_int8 = int4_values_interleaved.to(torch.int8)
+
+    low_nibbles = input_tensor_int8[..., 0::2]
+    high_nibbles = input_tensor_int8[..., 1::2]
+
+    packed_tensor = (high_nibbles << 4) | (low_nibbles & 0x0F)
+
+    return packed_tensor.to(torch.int8)
+
+
+def pack_interleave(num_experts, ref_weight, ref_scale):
+    n, k = ref_weight.shape[1], ref_weight.shape[2]
+
+    weight = pack_int4_values_to_int8(ref_weight.cpu()).cuda()
+    w_q = weight.view((num_experts, n, k // 2)).view(torch.int8)
+    w_q = w_q.contiguous()
+
+    scale_interleaved = ref_scale.reshape(
+        ref_scale.shape[0], ref_scale.shape[1], (ref_scale.shape[2] // 4), 4
+    )  # [E, N, K/4, 4]
+    scale_interleaved = scale_interleaved.permute(0, 2, 1, 3)  # [E, K/4, N, 4]
+    scale_interleaved = scale_interleaved.reshape(
+        ref_scale.shape[0], ref_scale.shape[2] // 4, ref_scale.shape[1] * 4
+    )  # [E, K/4, N*4]
+    w_scale = scale_interleaved.contiguous()
+
+    return w_q, w_scale
+
+
+@pytest.mark.parametrize("M", [1, 2, 4, 8, 16])
+@pytest.mark.parametrize("N", [2048])
+@pytest.mark.parametrize("K", [7168])
+@pytest.mark.parametrize("E", [256])
+@pytest.mark.parametrize("ep_size", [8])
+@pytest.mark.parametrize("topk", [8])
+@pytest.mark.parametrize("group_size", [128])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
+    local_e = E // ep_size
+
+    debug = False
+    if debug:
+        a = torch.ones((M, K), dtype=dtype, device="cuda") * 0.001
+        ref_weight_1 = torch.ones((local_e, N * 2, K), dtype=torch.int8, device="cuda")
+        ref_weight_2 = torch.ones((local_e, K, N), dtype=torch.int8, device="cuda")
+        a1_scale = torch.ones(1, dtype=torch.float32, device="cuda")
+        a2_scale = torch.ones(1, dtype=torch.float32, device="cuda")
+        scale_1 = torch.ones(
+            (local_e, N * 2, K // group_size), dtype=dtype, device="cuda"
+        )
+        scale_2 = torch.ones((local_e, K, N // group_size), dtype=dtype, device="cuda")
+    else:
+        a = torch.randn(M, K, dtype=dtype, device="cuda")
+        ref_weight_1 = torch.randint(
+            -8, 8, (local_e, N * 2, K), dtype=torch.int8, device="cuda"
+        )
+        ref_weight_2 = torch.randint(
+            -8, 8, (local_e, K, N), dtype=torch.int8, device="cuda"
+        )
+        affine_coeff = 0.005
+        a1_scale = torch.randn(1, dtype=torch.float32, device="cuda")
+        a2_scale = torch.randn(1, dtype=torch.float32, device="cuda")
+        scale_1 = (
+            torch.randn(local_e, N * 2, K // group_size, dtype=dtype, device="cuda")
+            * affine_coeff
+        )
+        scale_2 = (
+            torch.randn(local_e, K, N // group_size, dtype=dtype, device="cuda")
+            * affine_coeff
+        )
+
+    w1_q, w1_scale = pack_interleave(local_e, ref_weight_1, scale_1)
+    w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2)
+
+    device = "cuda"
+    a_strides1 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
+    c_strides1 = torch.full((local_e, 3), 2 * N, device=device, dtype=torch.int64)
+    a_strides2 = torch.full((local_e, 3), N, device=device, dtype=torch.int64)
+    c_strides2 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
+    b_strides1 = a_strides1
+    s_strides13 = c_strides1
+    b_strides2 = a_strides2
+    s_strides2 = c_strides2
+
+    score = torch.randn((M, E), dtype=dtype, device=device)
+    topk_weights, topk_ids = select_experts(
+        hidden_states=a,
+        router_logits=score,
+        top_k=topk,
+        use_grouped_topk=False,
+        renormalize=False,
+    )
+    expert_map = torch.arange(E, dtype=torch.int32, device=device)
+    expert_map[local_e:] = E
+
+    output = cutlass_moe(
+        a,
+        w1_q,
+        w2_q,
+        w1_scale,
+        w2_scale,
+        topk_weights,
+        topk_ids,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides13,
+        s_strides2,
+        0,
+        local_e - 1,
+        E,
+        a1_scale,
+        a2_scale,
+        expert_map,
+    )
+
+    ref_output = ref(
+        a,
+        local_e,
+        topk_weights,
+        topk_ids,
+        ref_weight_1,
+        ref_weight_2,
+        scale_1,
+        scale_2,
+        has_pre_quant=True,
+        has_alpha=True,
+        pre_quant_scale_1=a1_scale,
+        pre_quant_scale_2=a2_scale,
+        alpha_1=a1_scale,
+        alpha_2=a2_scale,
+    )
+
+    # compare
+    torch.cuda.synchronize()
+
+    # compare final output
+    torch.testing.assert_close(output, ref_output, rtol=1e-2, atol=0.1)
+    print("SUCCESS: Final output tensors are close.")
+
+
+def cutlass_moe(
+    a: torch.Tensor,
+    w1_q: torch.Tensor,
+    w2_q: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids_: torch.Tensor,
+    a_strides1: torch.Tensor,
+    b_strides1: torch.Tensor,
+    c_strides1: torch.Tensor,
+    a_strides2: torch.Tensor,
+    b_strides2: torch.Tensor,
+    c_strides2: torch.Tensor,
+    s_strides13: torch.Tensor,
+    s_strides2: torch.Tensor,
+    start_expert_id: int,
+    end_expert_id: int,
+    E: int,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    expert_map: Optional[torch.Tensor] = None,
+    apply_router_weight_on_input: bool = False,
+):
+    local_topk_ids = topk_ids_
+    local_topk_ids = torch.where(expert_map[topk_ids_] != E, expert_map[topk_ids_], E)
+    device = a.device
+
+    local_num_experts = end_expert_id - start_expert_id + 1
+    expert_offsets = torch.empty(
+        (local_num_experts + 1), dtype=torch.int32, device=device
+    )
+    problem_sizes1 = torch.empty(
+        (local_num_experts, 3), dtype=torch.int32, device=device
+    )
+    problem_sizes2 = torch.empty(
+        (local_num_experts, 3), dtype=torch.int32, device=device
+    )
+    return cutlass_w4a8_moe(
+        start_expert_id,
+        end_expert_id,
+        E,
+        a,
+        w1_q,
+        w2_q,
+        w1_scale,
+        w2_scale,
+        topk_weights,
+        topk_ids_,
+        local_topk_ids,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides13,
+        s_strides2,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        a1_scale,
+        a2_scale,
+        apply_router_weight_on_input,
+    )
+
+
+def ref(
+    x: torch.Tensor,
+    num_experts: int,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    ref_weight_1: torch.Tensor,
+    ref_weight_2: torch.Tensor,
+    ref_weight_scale_1: torch.Tensor,
+    ref_weight_scale_2: torch.Tensor,
+    has_pre_quant: bool = False,
+    has_alpha: bool = False,
+    pre_quant_scale_1: Optional[torch.Tensor] = None,
+    pre_quant_scale_2: Optional[torch.Tensor] = None,
+    alpha_1: Optional[torch.Tensor] = None,
+    alpha_2: Optional[torch.Tensor] = None,
+):
+    results = torch.zeros_like(x)
+    dtype = x.dtype
+    for e_idx in range(num_experts):
+        mask = topk_ids == e_idx
+        activated_tokens = mask.sum(1).bool()
+        act = x[activated_tokens, :]
+        if act.shape[0] == 0:
+            continue
+        final_scale = (topk_weights * mask).sum(1)[activated_tokens].unsqueeze(1)
+
+        act = (
+            torch.clamp((act / pre_quant_scale_1.float()), -448.0, 448.0)
+            .to(torch.float8_e4m3fn)
+            .to(dtype)
+        )
+        w3_w1 = ref_weight_1[e_idx]
+        ref_w_scale_repeat = (
+            ref_weight_scale_1[e_idx].repeat_interleave(128, dim=1).to(float)
+        )
+        w3_w1 = (w3_w1.to(float) * ref_w_scale_repeat).to(dtype)
+        fc1 = ((torch.matmul(act, w3_w1.T)) * alpha_1).to(torch.float16)
+
+        gate, fc1 = fc1.chunk(2, dim=-1)
+        fc1 = fc1 * torch.nn.functional.silu(gate)
+        act = (fc1 / pre_quant_scale_2.float()).to(torch.float8_e4m3fn)
+        act = act.to(dtype)
+
+        w2 = ref_weight_2[e_idx]
+        ref_w_scale_repeat = (
+            ref_weight_scale_2[e_idx].repeat_interleave(128, dim=1).to(float)
+        )
+        w2 = (w2.to(float) * ref_w_scale_repeat).to(dtype)
+        fc2 = (torch.matmul(act, w2.T) * alpha_2).to(torch.float16)
+
+        results[activated_tokens, :] += (fc2 * final_scale).to(results.dtype)
+
+    return results
sglang/utils.py
CHANGED
@@ -1,6 +1,5 @@
 """Common utilities"""

-import base64
 import importlib
 import json
 import logging
@@ -20,6 +19,7 @@ from json import dumps
 from typing import Any, Callable, List, Optional, Tuple, Type, Union

 import numpy as np
+import pybase64
 import requests
 from IPython.display import HTML, display
 from pydantic import BaseModel
@@ -148,15 +148,15 @@ def encode_image_base64(image_path: Union[str, bytes]):
     if isinstance(image_path, str):
         with open(image_path, "rb") as image_file:
             data = image_file.read()
-        return base64.b64encode(data).decode("utf-8")
+        return pybase64.b64encode(data).decode("utf-8")
     elif isinstance(image_path, bytes):
-        return base64.b64encode(image_path).decode("utf-8")
+        return pybase64.b64encode(image_path).decode("utf-8")
     else:
         # image_path is PIL.WebPImagePlugin.WebPImageFile
         image = image_path
         buffered = BytesIO()
         image.save(buffered, format="PNG")
-        return base64.b64encode(buffered.getvalue()).decode("utf-8")
+        return pybase64.b64encode(buffered.getvalue()).decode("utf-8")


 def encode_frame(frame):
@@ -223,7 +223,7 @@ def encode_video_base64(video_path: str, num_frames: int = 16):
     video_bytes = b"".join(encoded_frames)

     # Encode the concatenated bytes to base64
-    video_base64 = "video:" + base64.b64encode(video_bytes).decode("utf-8")
+    video_base64 = "video:" + pybase64.b64encode(video_bytes).decode("utf-8")

     return video_base64

sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.9"
+__version__ = "0.4.9.post1"
{sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.9
+Version: 0.4.9.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -239,6 +239,7 @@ Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: pynvml; extra == "runtime-common"
+Requires-Dist: pybase64; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
@@ -248,7 +249,7 @@ Requires-Dist: transformers==4.53.0; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.
+Requires-Dist: xgrammar==0.1.20; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: sgl-kernel==0.2.4; extra == "srt"
@@ -419,7 +420,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)

 ## Adoption and Sponsorship
-SGLang has been deployed at large scale, generating trillions of tokens in production
+SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 1,000,000 GPUs worldwide.

 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>