sglang 0.4.9.post4__py3-none-any.whl → 0.4.9.post6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/chat_template.py +21 -0
- sglang/srt/configs/internvl.py +3 -0
- sglang/srt/configs/model_config.py +7 -0
- sglang/srt/constrained/base_grammar_backend.py +10 -2
- sglang/srt/constrained/xgrammar_backend.py +7 -5
- sglang/srt/conversation.py +16 -1
- sglang/srt/debug_utils/__init__.py +0 -0
- sglang/srt/debug_utils/dump_comparator.py +131 -0
- sglang/srt/debug_utils/dumper.py +108 -0
- sglang/srt/debug_utils/text_comparator.py +172 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
- sglang/srt/disaggregation/mooncake/conn.py +16 -0
- sglang/srt/disaggregation/prefill.py +13 -1
- sglang/srt/entrypoints/engine.py +4 -2
- sglang/srt/entrypoints/http_server.py +13 -1
- sglang/srt/entrypoints/openai/protocol.py +3 -1
- sglang/srt/entrypoints/openai/serving_base.py +5 -2
- sglang/srt/entrypoints/openai/serving_chat.py +132 -79
- sglang/srt/function_call/ebnf_composer.py +10 -3
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +164 -0
- sglang/srt/function_call/qwen3_coder_detector.py +1 -0
- sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
- sglang/srt/layers/attention/vision.py +56 -8
- sglang/srt/layers/layernorm.py +26 -1
- sglang/srt/layers/logits_processor.py +14 -3
- sglang/srt/layers/moe/ep_moe/layer.py +323 -242
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
- sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
- sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
- sglang/srt/layers/moe/topk.py +90 -24
- sglang/srt/layers/multimodal.py +11 -8
- sglang/srt/layers/quantization/fp8.py +25 -247
- sglang/srt/layers/quantization/fp8_kernel.py +78 -48
- sglang/srt/layers/quantization/modelopt_quant.py +27 -10
- sglang/srt/layers/quantization/unquant.py +24 -76
- sglang/srt/layers/quantization/w4afp8.py +68 -17
- sglang/srt/lora/lora_registry.py +93 -29
- sglang/srt/managers/cache_controller.py +9 -7
- sglang/srt/managers/data_parallel_controller.py +4 -0
- sglang/srt/managers/io_struct.py +12 -0
- sglang/srt/managers/mm_utils.py +154 -35
- sglang/srt/managers/multimodal_processor.py +3 -14
- sglang/srt/managers/schedule_batch.py +14 -8
- sglang/srt/managers/scheduler.py +64 -1
- sglang/srt/managers/scheduler_input_blocker.py +106 -0
- sglang/srt/managers/tokenizer_manager.py +80 -15
- sglang/srt/managers/tp_worker.py +8 -0
- sglang/srt/mem_cache/hiradix_cache.py +5 -2
- sglang/srt/model_executor/model_runner.py +83 -27
- sglang/srt/models/deepseek_v2.py +75 -84
- sglang/srt/models/glm4_moe.py +1035 -0
- sglang/srt/models/glm4_moe_nextn.py +167 -0
- sglang/srt/models/interns1.py +328 -0
- sglang/srt/models/internvl.py +143 -47
- sglang/srt/models/llava.py +9 -5
- sglang/srt/models/minicpmo.py +4 -1
- sglang/srt/models/qwen2_moe.py +2 -2
- sglang/srt/models/qwen3_moe.py +17 -71
- sglang/srt/multimodal/processors/base_processor.py +20 -6
- sglang/srt/multimodal/processors/clip.py +2 -2
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
- sglang/srt/multimodal/processors/gemma3.py +2 -2
- sglang/srt/multimodal/processors/gemma3n.py +2 -2
- sglang/srt/multimodal/processors/internvl.py +21 -8
- sglang/srt/multimodal/processors/janus_pro.py +2 -2
- sglang/srt/multimodal/processors/kimi_vl.py +2 -2
- sglang/srt/multimodal/processors/llava.py +4 -4
- sglang/srt/multimodal/processors/minicpm.py +2 -3
- sglang/srt/multimodal/processors/mlama.py +2 -2
- sglang/srt/multimodal/processors/mllama4.py +18 -111
- sglang/srt/multimodal/processors/phi4mm.py +2 -2
- sglang/srt/multimodal/processors/pixtral.py +2 -2
- sglang/srt/multimodal/processors/qwen_audio.py +2 -2
- sglang/srt/multimodal/processors/qwen_vl.py +2 -2
- sglang/srt/multimodal/processors/vila.py +3 -1
- sglang/srt/poll_based_barrier.py +31 -0
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +65 -6
- sglang/srt/two_batch_overlap.py +8 -3
- sglang/srt/utils.py +96 -1
- sglang/srt/weight_sync/utils.py +119 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_utils.py +118 -5
- sglang/utils.py +19 -0
- sglang/version.py +1 -1
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/METADATA +5 -4
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/RECORD +97 -80
- sglang/srt/debug_utils.py +0 -74
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py
CHANGED
@@ -15,6 +15,7 @@
 
 from __future__ import annotations
 
+import asyncio
 import builtins
 import ctypes
 import dataclasses
@@ -85,6 +86,8 @@ from torch.profiler import ProfilerActivity, profile, record_function
 from torch.utils._contextlib import _DecoratorContextManager
 from triton.runtime.cache import FileCacheManager
 
+from sglang.srt.metrics.func_timer import enable_func_timer
+
 logger = logging.getLogger(__name__)
 
 show_time_cost = False
@@ -2049,7 +2052,7 @@ def rank0_log(msg: str):
         logger.info(msg)
 
 
-def launch_dummy_health_check_server(host, port):
+def launch_dummy_health_check_server(host, port, enable_metrics):
    import asyncio

    import uvicorn
@@ -2067,6 +2070,11 @@ def launch_dummy_health_check_server(host, port):
        """Check the health of the http server."""
        return Response(status_code=200)

+    # Add prometheus middleware
+    if enable_metrics:
+        add_prometheus_middleware(app)
+        enable_func_timer()
+
    config = uvicorn.Config(
        app,
        host=host,
@@ -2335,6 +2343,7 @@ def is_fa3_default_architecture(hf_config):
        "Gemma3ForConditionalGeneration",
        "Qwen3ForCausalLM",
        "Qwen3MoeForCausalLM",
+        "Glm4MoeForCausalLM",
    }
    return architectures[0] in default_archs

@@ -2855,3 +2864,89 @@ SUPPORTED_LORA_TARGET_MODULES = [
 ]
 
 LORA_TARGET_ALL_MODULES = "all"
+
+
+class ConcurrentCounter:
+    """
+    An asynchronous counter for managing concurrent tasks that need
+    coordinated increments, decrements, and waiting until the count reaches zero.
+
+    This class is useful for scenarios like tracking the number of in-flight tasks
+    and waiting for them to complete.
+    """
+
+    def __init__(self, initial: int = 0):
+        """
+        Initialize the counter with an optional initial value.
+
+        Args:
+            initial (int): The initial value of the counter. Default is 0.
+        """
+        self._count = initial
+        self._condition = asyncio.Condition()
+
+    def value(self) -> int:
+        """
+        Return the current value of the counter.
+
+        Note:
+            This method is not synchronized. It may return a stale value
+            if other coroutines are concurrently modifying the counter.
+
+        Returns:
+            int: The current counter value.
+        """
+        return self._count
+
+    def __repr__(self) -> str:
+        """Return an informative string representation of the counter."""
+        return f"<ConcurrentCounter value={self.value()}>"
+
+    async def increment(self, n: int = 1, notify_all: bool = True):
+        """
+        Atomically increment the counter by a given amount and notify all waiters.
+
+        Args:
+            n (int): The amount to increment the counter by. Default is 1.
+            notify_all (bool): Whether to notify all waiters after incrementing. Default is True.
+        """
+        async with self._condition:
+            self._count += n
+            if notify_all:
+                self._condition.notify_all()
+
+    async def decrement(self, n: int = 1, notify_all: bool = True):
+        """
+        Atomically decrement the counter by a given amount and notify all waiters.
+
+        Args:
+            n (int): The amount to decrement the counter by. Default is 1.
+            notify_all (bool): Whether to notify all waiters after decrementing. Default is True.
+        """
+        async with self._condition:
+            self._count -= n
+            if notify_all:
+                self._condition.notify_all()
+
+    async def wait_for(self, condition: Callable[[int], bool]):
+        """
+        Asynchronously wait until the counter satisfies a given condition.
+
+        This suspends the calling coroutine without blocking the thread, allowing
+        other tasks to run while waiting. When the condition is met, the coroutine resumes.
+
+        Args:
+            condition (Callable[[int], bool]): A function that takes the current counter value
+                and returns True when the condition is satisfied.
+        """
+        async with self._condition:
+            await self._condition.wait_for(lambda: condition(self._count))
+
+    async def wait_for_zero(self):
+        """
+        Asynchronously wait until the counter reaches zero.
+
+        This suspends the calling coroutine without blocking the thread, allowing
+        other tasks to run while waiting. When the counter becomes zero, the coroutine resumes.
+        """
+        self.wait_for(lambda count: count == 0)
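Note on the new ConcurrentCounter helper: a minimal usage sketch follows (not part of the diff; the worker coroutine and counts are illustrative). As reconstructed above, wait_for_zero calls wait_for without awaiting it, so this sketch waits through wait_for directly.

import asyncio

from sglang.srt.utils import ConcurrentCounter


async def main():
    counter = ConcurrentCounter()
    num_tasks = 8

    # Register all in-flight tasks up front so the counter cannot hit zero early.
    await counter.increment(num_tasks)

    async def worker():
        try:
            await asyncio.sleep(0.1)  # stand-in for real work
        finally:
            await counter.decrement()

    tasks = [asyncio.create_task(worker()) for _ in range(num_tasks)]

    # Cooperatively wait until every worker has decremented the counter to zero.
    await counter.wait_for(lambda count: count == 0)
    await asyncio.gather(*tasks)


asyncio.run(main())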
sglang/srt/weight_sync/utils.py
ADDED
@@ -0,0 +1,119 @@
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+from torch.distributed.device_mesh import DeviceMesh
+from torch.distributed.tensor import DTensor
+
+from sglang.srt.entrypoints.engine import Engine
+from sglang.srt.managers.tokenizer_manager import UpdateWeightsFromTensorReqInput
+from sglang.srt.model_executor.model_runner import LocalSerializedTensor
+from sglang.srt.utils import MultiprocessingSerializer
+
+
+async def update_weights(
+    engine: Engine,
+    params_batch: list[tuple[str, torch.Tensor]],
+    device_mesh_key: str,
+    device_mesh: DeviceMesh,
+    load_format: Optional[str] = None,
+):
+    """
+    Update weights for the inference engine.
+    This function is designed to be stateless, so that the caller process could keep the stateful engine.
+    Example Use Case:
+        - Multiple Producer Process will call this function in a SPMD style
+
+    Args:
+        engine: The inference engine created by the caller process.
+        params_batch: A list of (name, tensor) tuples. We batched the tensors to avoid the overhead of cpu call.
+        device_mesh_key: The key of the device mesh. Typically "tp" or "infer_tp"
+        device_mesh: The device mesh.
+        load_format: The format of the weights.
+    """
+    infer_tp_size = device_mesh[device_mesh_key].mesh.size()[0]
+    infer_tp_rank = device_mesh[device_mesh_key].get_local_rank()
+    from sglang.srt.patch_torch import monkey_patch_torch_reductions
+
+    monkey_patch_torch_reductions()
+
+    # [
+    #     (name0, ipc_tensor0_tp0),
+    #     (name1, ipc_tensor1_tp0),
+    # ]
+    named_tensors_batch = [
+        (
+            name,
+            MultiprocessingSerializer.serialize(
+                _preprocess_tensor_for_update_weights(tensor)
+            ),
+        )
+        for name, tensor in params_batch
+    ]
+
+    if infer_tp_rank == 0:
+        gathered_serialized_batches = [None for _ in range(infer_tp_size)]
+    else:
+        gathered_serialized_batches = None
+
+    # [
+    #     [ (name0, ipc_tensor0_tp0), (name1, ipc_tensor1_tp0) ],
+    #     [ (name0, ipc_tensor0_tp1), (name1, ipc_tensor1_tp1) ],
+    # ]
+    dist.gather_object(
+        obj=named_tensors_batch,
+        object_gather_list=gathered_serialized_batches,
+        dst=device_mesh[device_mesh_key].mesh.tolist()[0],
+        group=device_mesh[device_mesh_key].get_group(),
+    )
+
+    if infer_tp_rank == 0:
+        # Use zip(*) to "transpose" the data structure.
+        # After transpose, the data structure is like:
+        # [
+        #     ( (name0, ipc_tensor0_tp0), (name0, ipc_tensor0_tp1) ),
+        #     ( (name1, ipc_tensor1_tp0), (name1, ipc_tensor1_tp1) ),
+        # ]
+        logical_tensors = zip(*gathered_serialized_batches, strict=True)
+
+        named_tensors = [
+            # [
+            #     (name0, LocalSerializedTensor(values=[ipc_tensor0_tp0, ipc_tensor0_tp1])),
+            #     (name1, LocalSerializedTensor(values=[ipc_tensor1_tp0, ipc_tensor1_tp1])),
+            # ]
+            (
+                tensor_group[0][0],
+                LocalSerializedTensor(
+                    values=[rank_part[1] for rank_part in tensor_group]
+                ),
+            )
+            for tensor_group in logical_tensors
+        ]
+
+        update_weights_request = UpdateWeightsFromTensorReqInput(
+            serialized_named_tensors=[
+                MultiprocessingSerializer.serialize(named_tensors)
+                for _ in range(infer_tp_size)
+            ],
+            load_format=load_format,
+        )
+
+        return await engine.update_weights_from_tensor(update_weights_request)
+
+
+def _preprocess_tensor_for_update_weights(tensor: torch.Tensor):
+    """
+    Preprocess the tensor for update weights.
+    Example Use Case:
+        - FSDP: we gather tensor by calling full_tensor in _preprocess_tensor_for_update_weights
+        - Megatron: we do nothing here, assuming it is gathered when feed into this func
+
+    Args:
+        tensor: The tensor to be preprocessed.
+
+    Returns:
+        The full tensor if it is a DTensor, otherwise the original tensor.
+    """
+    if isinstance(tensor, DTensor):
+        return tensor.full_tensor()
+    return tensor
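For orientation, a hedged sketch of how a trainer process might drive the new update_weights helper in SPMD style. The mesh name "infer_tp", the mesh construction, and the batching size are illustrative assumptions; only the helper itself comes from this diff.

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh

from sglang.srt.entrypoints.engine import Engine
from sglang.srt.weight_sync.utils import update_weights


async def push_weights(engine: Engine, model: torch.nn.Module):
    # Assumes torch.distributed is already initialized on every producer rank.
    device_mesh = init_device_mesh(
        "cuda", (dist.get_world_size(),), mesh_dim_names=("infer_tp",)
    )

    named_params = list(model.named_parameters())
    batch_size = 8  # batch tensors to amortize per-call overhead

    for i in range(0, len(named_params), batch_size):
        params_batch = [(n, p.data) for n, p in named_params[i : i + batch_size]]
        # Every rank calls this (SPMD); only infer_tp rank 0 sends the
        # gathered request to the engine.
        await update_weights(
            engine=engine,
            params_batch=params_batch,
            device_mesh_key="infer_tp",
            device_mesh=device_mesh,
        )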
sglang/test/runners.py
CHANGED
@@ -491,6 +491,8 @@ class SRTRunner:
         lora_paths: List[str] = None,
         max_loras_per_batch: int = 4,
         attention_backend: Optional[str] = None,
+        prefill_attention_backend: Optional[str] = None,
+        decode_attention_backend: Optional[str] = None,
         lora_backend: str = "triton",
         disable_cuda_graph: bool = False,
         disable_radix_cache: bool = False,
@@ -540,6 +542,8 @@ class SRTRunner:
             max_loras_per_batch=max_loras_per_batch,
             lora_backend=lora_backend,
             attention_backend=attention_backend,
+            prefill_attention_backend=prefill_attention_backend,
+            decode_attention_backend=decode_attention_backend,
             disable_cuda_graph=disable_cuda_graph,
             disable_radix_cache=disable_radix_cache,
             chunked_prefill_size=chunked_prefill_size,
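A hypothetical sketch of exercising the new per-phase overrides in a test. The model path, the remaining constructor arguments, and the backend names are assumptions; only prefill_attention_backend and decode_attention_backend come from this diff.

import torch

from sglang.test.runners import SRTRunner

# Hypothetical test snippet: mix a prefill backend with a different decode backend.
with SRTRunner(
    "meta-llama/Llama-3.1-8B-Instruct",  # illustrative model
    torch_dtype=torch.float16,
    model_type="generation",
    prefill_attention_backend="fa3",
    decode_attention_backend="flashinfer",
) as runner:
    outputs = runner.forward(["The capital of France is"], max_new_tokens=8)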
sglang/test/test_utils.py
CHANGED
@@ -1,6 +1,7 @@
 """Common utilities for testing and benchmarking"""
 
 import argparse
+import asyncio
 import copy
 import json
 import logging
@@ -14,9 +15,11 @@ import unittest
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
 from functools import partial
+from pathlib import Path
 from types import SimpleNamespace
-from typing import Callable, List, Optional, Tuple
+from typing import Awaitable, Callable, List, Optional, Tuple
 
+import aiohttp
 import numpy as np
 import requests
 import torch
@@ -26,6 +29,7 @@ from sglang.bench_serving import run_benchmark
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.lang.interpreter import ProgramState
 from sglang.srt.utils import (
     get_bool_env_var,
     get_device,
@@ -347,6 +351,7 @@ def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
         help="Device type (auto/cuda/rocm/cpu). Auto will detect available platforms",
     )
     parser.add_argument("--result-file", type=str, default="result.jsonl")
+    parser.add_argument("--raw-result-file", type=str)
     args = parser.parse_args()
 
     return args
@@ -714,6 +719,7 @@ def get_benchmark_args(
     seed: int = 0,
     device="auto",
     pd_separated: bool = False,
+    lora_name=None,
 ):
     return SimpleNamespace(
         backend="sglang",
@@ -741,7 +747,7 @@ def get_benchmark_args(
         extra_request_body=None,
         apply_chat_template=False,
         profile=None,
-        lora_name=
+        lora_name=lora_name,
         prompt_suffix="",
         device=device,
         pd_separated=pd_separated,
@@ -764,6 +770,8 @@ def run_bench_serving(
     need_warmup=False,
     seed: int = 0,
     device="auto",
+    background_task: Optional[Callable[[str, asyncio.Event], Awaitable[None]]] = None,
+    lora_name: Optional[str] = None,
 ):
     if device == "auto":
         device = auto_config_device()
@@ -791,14 +799,35 @@ def run_bench_serving(
         disable_ignore_eos=disable_ignore_eos,
         seed=seed,
         device=device,
+        lora_name=lora_name,
     )
 
-
+    async def _run():
         if need_warmup:
             warmup_args = copy.deepcopy(args)
             warmup_args.num_prompts = 16
-            run_benchmark
-
+            await asyncio.to_thread(run_benchmark, warmup_args)
+
+        start_event = asyncio.Event()
+        stop_event = asyncio.Event()
+        task_handle = (
+            asyncio.create_task(background_task(base_url, start_event, stop_event))
+            if background_task
+            else None
+        )
+
+        try:
+            start_event.set()
+            result = await asyncio.to_thread(run_benchmark, args)
+        finally:
+            if task_handle:
+                stop_event.set()
+                await task_handle
+
+        return result
+
+    try:
+        res = asyncio.run(_run())
     finally:
         kill_process_tree(process.pid)
 
@@ -1275,6 +1304,58 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
            raise
 
 
+def send_generate_requests(base_url: str, num_requests: int) -> List[str]:
+    """Sends generate request serially and returns status codes. Max concurrency is 1."""
+
+    def generate():
+        prompt = """
+        System: You are a helpful assistant.
+        User: What is the capital of France?
+        Assistant: The capital of France is
+        """
+        response = requests.post(
+            f"{base_url}/generate",
+            json={
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 50,
+                },
+            },
+        )
+        return response.status_code
+
+    return [generate() for _ in range(num_requests)]
+
+
+async def send_concurrent_generate_requests(
+    base_url: str, num_requests: int
+) -> List[str]:
+    """Sends generate request concurrently and returns status codes. Max concurrency is num_requests."""
+
+    async def async_generate():
+        async with aiohttp.ClientSession() as session:
+            prompt = """
+            System: You are a helpful assistant.
+            User: What is the capital of France?
+            Assistant: The capital of France is
+            """
+            async with session.post(
+                f"{base_url}/generate",
+                json={
+                    "text": prompt,
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 50,
+                    },
+                },
+            ) as response:
+                return response.status
+
+    tasks = [asyncio.create_task(async_generate()) for _ in range(num_requests)]
+    return await asyncio.gather(*tasks)
+
+
 class CustomTestCase(unittest.TestCase):
     def _callTestMethod(self, method):
         max_retry = int(
@@ -1284,3 +1365,35 @@ class CustomTestCase(unittest.TestCase):
             lambda: super(CustomTestCase, self)._callTestMethod(method),
             max_retry=max_retry,
         )
+
+
+def dump_bench_raw_result(
+    path: str,
+    states,
+    preds,
+    labels,
+):
+    if not path:
+        return
+
+    rows = []
+    for i in range(len(states)):
+        state = states[i]
+        output = state["answer"]
+        prompt = _ensure_remove_suffix(state.text(), output)
+        rows.append(
+            dict(
+                prompt_id=i,
+                prompt=prompt,
+                output=output,
+                correct=bool(preds[i] == labels[i]),
+            )
+        )
+
+    print(f"BenchRawResultDumper save results to {path}")
+    Path(path).write_text("\n".join(json.dumps(row) for row in rows))
+
+
+def _ensure_remove_suffix(text: str, suffix: str):
+    assert text.endswith(suffix)
+    return text.removesuffix(suffix)
sglang/utils.py
CHANGED
@@ -14,6 +14,7 @@ import traceback
 import urllib.request
 import weakref
 from concurrent.futures import ThreadPoolExecutor
+from functools import wraps
 from io import BytesIO
 from json import dumps
 from typing import Any, Callable, List, Optional, Tuple, Type, Union
@@ -28,6 +29,24 @@ from tqdm import tqdm
 logger = logging.getLogger(__name__)
 
 
+def execute_once(func):
+    has_run = None
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        nonlocal has_run
+        if not has_run:
+            func(*args, **kwargs)
+            has_run = True
+
+    return wrapper
+
+
+@execute_once
+def info_once(message: str):
+    logger.info(message)
+
+
 def convert_json_schema_to_str(json_schema: Union[dict, str, Type[BaseModel]]) -> str:
     """Convert a JSON schema to a string.
     Parameters
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.9.post4"
+__version__ = "0.4.9.post6"
{sglang-0.4.9.post4.dist-info → sglang-0.4.9.post6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.9.post4
+Version: 0.4.9.post6
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -246,7 +246,7 @@ Requires-Dist: sentencepiece; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.
+Requires-Dist: transformers==4.54.0; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
@@ -259,7 +259,7 @@ Requires-Dist: torchaudio==2.7.1; extra == "srt"
 Requires-Dist: torchvision==0.22.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: einops; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.
+Requires-Dist: flashinfer_python==0.2.9rc2; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -268,7 +268,8 @@ Requires-Dist: torchaudio==2.7.1; extra == "blackwell"
 Requires-Dist: torchvision==0.22.1; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
 Requires-Dist: einops; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.2.
+Requires-Dist: flashinfer_python==0.2.9rc2; extra == "blackwell"
+Requires-Dist: tiktoken; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"