sglang-0.3.4-py3-none-any.whl → sglang-0.3.4.post2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_latency.py +2 -1
- sglang/lang/chat_template.py +17 -0
- sglang/launch_server_llavavid.py +1 -1
- sglang/srt/configs/__init__.py +3 -0
- sglang/srt/configs/model_config.py +27 -2
- sglang/srt/configs/qwen2vl.py +133 -0
- sglang/srt/constrained/fsm_cache.py +10 -3
- sglang/srt/conversation.py +27 -0
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/__init__.py +16 -5
- sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
- sglang/srt/layers/attention/flashinfer_backend.py +174 -54
- sglang/srt/layers/attention/triton_backend.py +22 -6
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
- sglang/srt/layers/linear.py +89 -63
- sglang/srt/layers/logits_processor.py +5 -5
- sglang/srt/layers/rotary_embedding.py +112 -0
- sglang/srt/layers/sampler.py +51 -39
- sglang/srt/lora/lora.py +3 -1
- sglang/srt/managers/data_parallel_controller.py +1 -1
- sglang/srt/managers/detokenizer_manager.py +4 -0
- sglang/srt/managers/image_processor.py +186 -13
- sglang/srt/managers/io_struct.py +10 -0
- sglang/srt/managers/schedule_batch.py +238 -68
- sglang/srt/managers/scheduler.py +69 -50
- sglang/srt/managers/tokenizer_manager.py +24 -4
- sglang/srt/managers/tp_worker.py +26 -111
- sglang/srt/managers/tp_worker_overlap_thread.py +209 -0
- sglang/srt/mem_cache/memory_pool.py +56 -10
- sglang/srt/mem_cache/radix_cache.py +4 -3
- sglang/srt/model_executor/cuda_graph_runner.py +87 -28
- sglang/srt/model_executor/forward_batch_info.py +83 -3
- sglang/srt/model_executor/model_runner.py +32 -11
- sglang/srt/models/chatglm.py +3 -3
- sglang/srt/models/deepseek_v2.py +2 -2
- sglang/srt/models/mllama.py +1004 -0
- sglang/srt/models/qwen2_vl.py +724 -0
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
- sglang/srt/sampling/sampling_batch_info.py +13 -3
- sglang/srt/sampling/sampling_params.py +5 -7
- sglang/srt/server.py +12 -0
- sglang/srt/server_args.py +10 -0
- sglang/srt/utils.py +22 -0
- sglang/test/run_eval.py +2 -0
- sglang/test/runners.py +20 -1
- sglang/test/srt/sampling/penaltylib/utils.py +1 -0
- sglang/test/test_utils.py +100 -3
- sglang/version.py +1 -1
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +17 -18
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +53 -48
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py
CHANGED
@@ -31,9 +31,12 @@ class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
         padded_stop_token_ids = torch.nn.utils.rnn.pad_sequence(
             sequences=[
                 torch.tensor(
-                    data=
-
-
+                    data=(
+                        list(
+                            (req.sampling_params.stop_token_ids or set())
+                            | (req.tokenizer.additional_stop_token_ids or set())
+                            | {req.tokenizer.eos_token_id}
+                        )
                     ),
                     dtype=torch.int64,
                     device=self.orchestrator.device,
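The new code builds the padded stop-token tensor from the union of the per-request stop token ids, the tokenizer's additional stop token ids, and the EOS token. A minimal illustrative sketch of that union logic (the ids and objects below are stand-ins, not values from the diff):

```python
# Illustrative sketch only: how the effective stop-token set is formed.
sampling_stop_ids = {128009}    # stand-in for req.sampling_params.stop_token_ids
additional_stop_ids = None      # stand-in for req.tokenizer.additional_stop_token_ids
eos_token_id = 128001           # stand-in for req.tokenizer.eos_token_id

effective_stop_ids = list(
    (sampling_stop_ids or set())
    | (additional_stop_ids or set())
    | {eos_token_id}
)
print(sorted(effective_stop_ids))  # [128001, 128009]
```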
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -51,7 +51,7 @@ class SamplingBatchInfo:
         disable_penalizer: bool,
     ):
         reqs = batch.reqs
-        device = batch.
+        device = batch.device
         temperatures = (
             torch.tensor(
                 [r.sampling_params.temperature for r in reqs],
@@ -78,7 +78,7 @@ class SamplingBatchInfo:
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
             is_all_greedy=top_ks.max().item() <= 1,
             vocab_size=vocab_size,
-            device=
+            device=device,
         )
         # TODO (lianmin): `need_min_p_sampling` needs to be updated in filter and merge.

@@ -95,7 +95,7 @@ class SamplingBatchInfo:
         ret.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
             vocab_size=vocab_size,
             batch=batch,
-            device=batch.
+            device=batch.device,
             Penalizers={
                 penaltylib.BatchedFrequencyPenalizer,
                 penaltylib.BatchedMinNewTokensPenalizer,
@@ -224,3 +224,13 @@ class SamplingBatchInfo:
             vocab_size=self.vocab_size,
             device=self.device,
         )
+
+    def to(self, device: str):
+        for item in [
+            "temperatures",
+            "top_ps",
+            "top_ks",
+            "min_ps",
+        ]:
+            value = getattr(self, item)
+            setattr(self, item, value.to(device, non_blocking=True))
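The new `SamplingBatchInfo.to()` moves the per-batch sampling tensors to a target device with non-blocking copies. A minimal standalone sketch of the same getattr/setattr pattern; the class and fields below are illustrative, not the real SamplingBatchInfo:

```python
import torch


class TinySamplingInfo:
    """Toy stand-in showing the tensor-moving pattern added in the diff."""

    def __init__(self):
        self.temperatures = torch.ones(4, 1)
        self.top_ps = torch.full((4,), 0.9)

    def to(self, device: str):
        for item in ["temperatures", "top_ps"]:
            value = getattr(self, item)
            # non_blocking=True lets the copy overlap with other GPU work
            setattr(self, item, value.to(device, non_blocking=True))


info = TinySamplingInfo()
info.to("cuda" if torch.cuda.is_available() else "cpu")
print(info.temperatures.device)
```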
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -50,9 +50,10 @@ class SamplingParams:
         self.presence_penalty = presence_penalty
         self.repetition_penalty = repetition_penalty
         self.stop_strs = stop
-        if stop_token_ids
-            stop_token_ids =
-
+        if stop_token_ids:
+            self.stop_token_ids = set(stop_token_ids)
+        else:
+            self.stop_token_ids = None
         self.max_new_tokens = max_new_tokens
         self.min_new_tokens = min_new_tokens
         self.ignore_eos = ignore_eos
@@ -119,10 +120,7 @@ class SamplingParams:
         # Process stop strings
         if self.stop_strs is None:
             self.stop_strs = []
-
-            self.stop_str_max_len = 0
-        else:
-            self.stop_str_max_len = 1
+            self.stop_str_max_len = 0
         else:
             if isinstance(self.stop_strs, str):
                 self.stop_strs = [self.stop_strs]
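With this change, `stop_token_ids` is normalized to a Python set when a non-empty value is given and to `None` otherwise. A hedged sketch of that normalization in isolation (the helper name below is made up for illustration, it is not part of the diff):

```python
# Hedged sketch: the stop_token_ids normalization now done in the constructor.
def normalize_stop_token_ids(stop_token_ids):
    # An empty list or None becomes None; anything else becomes a set.
    return set(stop_token_ids) if stop_token_ids else None


assert normalize_stop_token_ids([1, 2, 2]) == {1, 2}
assert normalize_stop_token_ids([]) is None
assert normalize_stop_token_ids(None) is None
```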
sglang/srt/server.py
CHANGED
@@ -172,6 +172,18 @@ async def stop_profile():
     )


+@app.api_route("/get_memory_pool_size", methods=["GET", "POST"])
+async def get_memory_pool_size():
+    """Get the memory pool size in number of tokens"""
+    try:
+        ret = await tokenizer_manager.get_memory_pool_size()
+        return ret.size
+    except Exception as e:
+        return JSONResponse(
+            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+        )
+
+
 @app.post("/update_weights")
 async def update_weights(obj: UpdateWeightReqInput, request: Request):
     """Update the weights inplace without re-launching the server."""
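The new route accepts GET or POST and returns the memory pool size in tokens. A hedged client-side sketch, assuming a locally launched server on the default port (the host/port are assumptions, not from the diff):

```python
# Hedged usage sketch: querying the new endpoint with requests.
import requests

base_url = "http://127.0.0.1:30000"  # assumed local sglang.launch_server address
resp = requests.get(f"{base_url}/get_memory_pool_size")
resp.raise_for_status()
print("memory pool size (tokens):", resp.json())
```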
sglang/srt/server_args.py
CHANGED
@@ -177,6 +177,16 @@ class ServerArgs:
         if self.sampling_backend is None:
             self.sampling_backend = "flashinfer"

+        if self.enable_overlap_schedule:
+            logger.warning(
+                "Overlap scheduler mode is enabled. This is an experimental feature. "
+                "Sampling penalizer (e.g., frequency and repetition penalty), constrained decoding (e.g., regex, JSON), "
+                "and embedding APIs are not supported and will lead to wrong results. "
+                "The NaN detection is also disabled."
+            )
+            self.disable_penalizer = True
+            self.disable_nan_detection = True
+
         # Model-specific patches
         if "Alibaba-NLP/gte-Qwen2-1.5B-instruct" == self.model_path:
             logger.info(
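Enabling the experimental overlap scheduler forces the penalizer and NaN detection off. The README hunk later in this diff documents the matching `--enable-overlap-scheduler` CLI flag; a hedged launch sketch from Python (the model path is just an example):

```python
# Hedged sketch: launching the server with the experimental overlap scheduler.
import subprocess

proc = subprocess.Popen(
    [
        "python", "-m", "sglang.launch_server",
        "--model-path", "meta-llama/Meta-Llama-3-8B-Instruct",
        "--enable-overlap-scheduler",
    ]
)
# ... send requests to the server, then shut it down:
proc.terminate()
```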
sglang/srt/utils.py
CHANGED
@@ -209,6 +209,28 @@ def is_multimodal_model(model_architectures):
         or "LlavaQwenForCausalLM" in model_architectures
         or "LlavaMistralForCausalLM" in model_architectures
         or "LlavaVidForCausalLM" in model_architectures
+        or "MllamaForConditionalGeneration" in model_architectures
+        or "Qwen2VLForConditionalGeneration" in model_architectures
+    ):
+        return True
+    else:
+        return False
+
+
+def is_attention_free_model(model_architectures):
+    return False
+
+
+def model_has_inner_state(model_architectures):
+    return False
+
+
+def is_embedding_model(model_architectures):
+    if (
+        "LlamaEmbeddingModel" in model_architectures
+        or "MistralModel" in model_architectures
+        or "LlamaForSequenceClassification" in model_architectures
+        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
     ):
         return True
     else:
sglang/test/run_eval.py
CHANGED
@@ -67,6 +67,7 @@ def run_eval(args):
         model=args.model,
         max_tokens=2048,
         base_url=base_url,
+        temperature=getattr(args, "temperature", 0.0),
     )

     # Run eval
@@ -119,6 +120,7 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
+    parser.add_argument("--temperature", type=float, default=0.0)
     args = parser.parse_args()

     run_eval(args)
sglang/test/runners.py
CHANGED
@@ -102,8 +102,10 @@ class HFRunner:
         return False

     def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
-
+        # Apply model-specific patches
+        monkey_patch_gemma2_sdpa()

+        # Load the model and tokenizer
         if self.model_type == "generation":
             self.base_model = AutoModelForCausalLM.from_pretrained(
                 model_path,
@@ -128,7 +130,9 @@
             ).cuda()
         else:
             raise Exception(f"Unrecognized model type {self.model_type}")
+        self.tokenizer = get_tokenizer(model_path, torch_dtype=torch.dtype)

+        # Run forward
         while True:
             prompts, max_new_tokens, lora_paths = in_queue.get()
             if lora_paths is not None:
@@ -370,3 +374,18 @@ class SRTRunner:
     def __exit__(self, exc_type, exc_value, traceback):
         self.runtime.shutdown()
         del self.runtime
+
+
+def monkey_patch_gemma2_sdpa():
+    """
+    Use sdpa by default to fix the OOM issue.
+    Revert this commit:
+    https://github.com/huggingface/transformers/commit/975b988bfe6e7ebb47390cd9a1556c6888804883#diff-5f76eac6f18f4b491521314c318a9692318feb4d19228e9576cce7bde4240834R660
+    """
+    from transformers.models.gemma2.modeling_gemma2 import Gemma2PreTrainedModel
+
+    def _check_and_enable_sdpa(config, hard_check_only: bool = False):
+        config._attn_implementation = "sdpa"
+        return config
+
+    setattr(Gemma2PreTrainedModel, "_check_and_enable_sdpa", _check_and_enable_sdpa)
sglang/test/test_utils.py
CHANGED
@@ -3,6 +3,7 @@
 import argparse
 import asyncio
 import os
+import random
 import subprocess
 import threading
 import time
@@ -20,6 +21,7 @@ from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback

 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
@@ -400,7 +402,7 @@ def popen_launch_server(
     api_key: Optional[str] = None,
     other_args: tuple = (),
     env: Optional[dict] = None,
-    return_stdout_stderr:
+    return_stdout_stderr: Optional[tuple] = None,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]
@@ -423,8 +425,8 @@
     if return_stdout_stderr:
         process = subprocess.Popen(
             command,
-            stdout=
-            stderr=
+            stdout=return_stdout_stderr[0],
+            stderr=return_stdout_stderr[1],
             env=env,
             text=True,
         )
@@ -631,3 +633,98 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
         rouge_l_scores.append(fmeasure)

     return rouge_l_scores
+
+
+STDOUT_FILENAME = "stdout.txt"
+STDERR_FILENAME = "stderr.txt"
+
+
+def read_output(output_lines):
+    """Print the output in real time with another thread."""
+    while not os.path.exists(STDERR_FILENAME):
+        time.sleep(1)
+
+    pt = 0
+    while pt >= 0:
+        if pt > 0 and not os.path.exists(STDERR_FILENAME):
+            break
+        lines = open(STDERR_FILENAME).readlines()
+        for line in lines[pt:]:
+            print(line, end="", flush=True)
+            output_lines.append(line)
+            pt += 1
+        time.sleep(0.1)
+
+
+def run_mmlu_test(
+    disable_radix_cache,
+    enable_mixed_chunk=False,
+    enable_overlap=False,
+    chunked_prefill_size=32,
+):
+    other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
+    if disable_radix_cache:
+        other_args += ["--disable-radix-cache"]
+    if enable_mixed_chunk:
+        other_args += ["--enable-mixed-chunk"]
+    if enable_overlap:
+        other_args += ["--enable-overlap-scheduler"]
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    port = random.randint(4000, 5000)
+    base_url = f"http://127.0.0.1:{port}"
+
+    # Create files and launch the server
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+        return_stdout_stderr=(stdout, stderr),
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines,))
+    t.start()
+
+    # Run the eval
+    args = SimpleNamespace(
+        base_url=base_url,
+        model=model,
+        eval_name="mmlu",
+        num_examples=128,
+        num_threads=128,
+    )
+
+    try:
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        assert metrics["score"] >= 0.65
+    finally:
+        pass
+
+    # Clean up everything
+    kill_child_process(process.pid)
+    kill_child_process(process.pid)
+    stdout.close()
+    stderr.close()
+    if os.path.exists(STDOUT_FILENAME):
+        os.remove(STDOUT_FILENAME)
+    if os.path.exists(STDERR_FILENAME):
+        os.remove(STDERR_FILENAME)
+    t.join()
+
+    # Assert success
+    has_new_server = False
+    has_leak = False
+    for line in output_lines:
+        if "The server is fired" in line:
+            has_new_server = True
+        if "leak" in line:
+            has_leak = True
+
+    assert has_new_server
+    # assert not has_leak
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.4"
+__version__ = "0.3.4.post2"
{sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.4
+Version: 0.3.4.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -259,7 +259,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm==0.
+Requires-Dist: vllm==0.6.3.post1; extra == "srt"
 Provides-Extra: srt_xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: test
@@ -284,17 +284,17 @@ Requires-Dist: peft; extra == "test"
 --------------------------------------------------------------------------------

 | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
-[**Join Bi-Weekly Development Meeting
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |

 ## News
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

 <details>
 <summary>More</summary>

+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -328,23 +328,27 @@ You can install SGLang using any of the methods below.
 pip install --upgrade pip
 pip install "sglang[all]"

-# Install FlashInfer
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.4.post2 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"

-# Install FlashInfer
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
 Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
@@ -498,7 +502,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
-- To enable
+- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currenly.
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currenly.
 - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
@@ -519,7 +524,6 @@ We also provide an inference engine **without a HTTP server**. For example,
 ```python
 import sglang as sgl

-
 def main():
     prompts = [
         "Hello, my name is",
@@ -539,12 +543,8 @@ if __name__ == "__main__":
     main()
 ```

-This can be used for
-
-1. **Offline Batch Inference**
-2. **Building Custom Servers**
-
-You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+This can be used for offline batch inference and building custom servers.
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine).

 ### Supported Models

@@ -552,7 +552,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- Qwen / Qwen 2 / Qwen 2 MoE
+- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL
 - DeepSeek / DeepSeek 2
 - OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
@@ -575,6 +575,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
 - SmolLM
+- GLM-4

 **Embedding Models**

@@ -711,7 +712,6 @@ print(state["answer_1"])
 ```

 #### More Examples
-
 Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).

@@ -892,7 +892,6 @@ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).


-
 <p align="center">
   <a href="#sglangtop" target="_blank">
     <bold>Back To Top </bold>