PyPI - sglang - Versions diffs - 0.3.1__py3-none-any.whl → 0.3.1.post2__py3-none-any.whl - Mend

sglang 0.3.1__py3-none-any.whl → 0.3.1.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

sglang/bench_latency.py +10 -3
sglang/bench_server_latency.py +187 -0
sglang/bench_serving.py +1 -1
sglang/global_config.py +5 -13
sglang/lang/interpreter.py +0 -3
sglang/srt/constrained/fsm_cache.py +5 -1
sglang/srt/layers/activation.py +16 -1
sglang/srt/layers/attention_backend.py +12 -12
sglang/srt/layers/fused_moe/layer.py +27 -7
sglang/srt/layers/layernorm.py +21 -6
sglang/srt/layers/sampler.py +40 -98
sglang/srt/lora/lora_manager.py +11 -8
sglang/srt/managers/io_struct.py +3 -0
sglang/srt/managers/policy_scheduler.py +49 -93
sglang/srt/managers/schedule_batch.py +2 -1
sglang/srt/managers/tp_worker.py +19 -13
sglang/srt/model_executor/cuda_graph_runner.py +25 -13
sglang/srt/model_executor/model_runner.py +37 -46
sglang/srt/models/deepseek_v2.py +8 -3
sglang/srt/models/llama.py +1 -3
sglang/srt/models/llama_classification.py +2 -3
sglang/srt/models/minicpm3.py +7 -3
sglang/srt/models/olmoe.py +415 -0
sglang/srt/models/xverse.py +1 -3
sglang/srt/models/xverse_moe.py +1 -4
sglang/srt/sampling/sampling_batch_info.py +3 -50
sglang/srt/server.py +6 -1
sglang/srt/server_args.py +39 -10
sglang/srt/utils.py +7 -51
sglang/test/few_shot_gsm8k.py +8 -2
sglang/test/test_utils.py +1 -1
sglang/version.py +1 -1
{sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/METADATA +4 -5
{sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/RECORD +37 -35
{sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/WHEEL +1 -1
{sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/LICENSE +0 -0
{sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/top_level.txt +0 -0

sglang/srt/server_args.py CHANGED Viewed

@@ -21,6 +21,8 @@ import logging
 import random
 from typing import List, Optional, Union
+from sglang.srt.utils import is_hip
 logger = logging.getLogger(__name__)
@@ -59,6 +61,7 @@ class ServerArgs:
     tp_size: int = 1
     stream_interval: int = 1
     random_seed: Optional[int] = None
+    constrained_json_whitespace_pattern: Optional[str] = None
     # Logging
     log_level: str = "info"
@@ -94,11 +97,12 @@ class ServerArgs:
     disable_cuda_graph_padding: bool = False
     disable_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
+    disable_mla: bool = False
     enable_mixed_chunk: bool = False
     enable_torch_compile: bool = False
+    max_torch_compile_bs: int = 32
     torchao_config: str = ""
     enable_p2p_check: bool = False
-    enable_mla: bool = False
     triton_attention_reduce_in_fp32: bool = False
     # LoRA
@@ -152,11 +156,12 @@ class ServerArgs:
             )
             self.sampling_backend = "pytorch"
-        # Default kernel backends
-        if self.enable_mla:
-            logger.info("MLA optimization is tunred on. Use triton backend.")
+        # ROCm: flashinfer available later
+        if is_hip():
             self.attention_backend = "triton"
+            self.sampling_backend = "pytorch"
+        # Default kernel backends
         if self.attention_backend is None:
             self.attention_backend = "flashinfer"
@@ -359,6 +364,12 @@ class ServerArgs:
             default=ServerArgs.random_seed,
             help="The random seed.",
         )
+        parser.add_argument(
+            "--constrained-json-whitespace-pattern",
+            type=str,
+            default=ServerArgs.constrained_json_whitespace_pattern,
+            help=r"Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
+        )
         parser.add_argument(
             "--log-level",
             type=str,
@@ -488,6 +499,11 @@ class ServerArgs:
             default=False,
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
+        parser.add_argument(
+            "--disable-mla",
+            action="store_true",
+            help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
+        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -498,6 +514,12 @@ class ServerArgs:
             action="store_true",
             help="Optimize the model with torch.compile. Experimental feature.",
         )
+        parser.add_argument(
+            "--max-torch-compile-bs",
+            type=int,
+            default=ServerArgs.max_torch_compile_bs,
+            help="Set the maximum batch size when using torch compile.",
+        )
         parser.add_argument(
             "--torchao-config",
             type=str,
@@ -509,11 +531,6 @@ class ServerArgs:
             action="store_true",
             help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
         )
-        parser.add_argument(
-            "--enable-mla",
-            action="store_true",
-            help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
-        )
         parser.add_argument(
             "--triton-attention-reduce-in-fp32",
             action="store_true",
@@ -532,7 +549,8 @@ class ServerArgs:
             type=str,
             nargs="*",
             default=None,
-            help="The list of LoRA adapters.",
+            action=LoRAPathAction,
+            help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}",
         )
         parser.add_argument(
             "--max-loras-per-batch",
@@ -590,3 +608,14 @@ class PortArgs:
     controller_port: int
     detokenizer_port: int
     nccl_ports: List[int]
+class LoRAPathAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, {})
+        for lora_path in values:
+            if "=" in lora_path:
+                name, path = lora_path.split("=", 1)
+                getattr(namespace, self.dest)[name] = path
+            else:
+                getattr(namespace, self.dest)[lora_path] = lora_path

sglang/srt/utils.py CHANGED Viewed

@@ -51,6 +51,11 @@ show_time_cost = False
 time_infos = {}
+# torch flag AMD GPU
+def is_hip() -> bool:
+    return torch.version.hip is not None
 def enable_show_time_cost():
     global show_time_cost
     show_time_cost = True
@@ -187,7 +192,7 @@ def allocate_init_ports(
         cur_port += 1
     if port is not None and ret_ports[0] != port:
-        logger.warn(
+        logger.warning(
             f"WARNING: Port {port} is not available. Use port {ret_ports[0]} instead."
         )
@@ -623,56 +628,7 @@ def set_ulimit(target_soft_limit=65535):
         try:
             resource.setrlimit(resource_type, (target_soft_limit, current_hard))
         except ValueError as e:
-            logger.warn(f"Fail to set RLIMIT_NOFILE: {e}")
-def is_llama3_405b_fp8_head_16(model_config):
-    """Return whether the model is meta-llama/Meta-Llama-3.1-405B-FP8 with 16 kv heads."""
-    if (
-        model_config.hf_config.architectures[0] == "LlamaForCausalLM"
-        and model_config.hf_config.hidden_size == 16384
-        and model_config.hf_config.intermediate_size == 53248
-        and model_config.hf_config.num_hidden_layers == 126
-        and model_config.hf_config.num_key_value_heads == 16
-        and hasattr(model_config.hf_config, "quantization_config")
-        and model_config.hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
-    ):
-        return True
-    return False
-def monkey_patch_vllm_qvk_linear_loader():
-    """A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints."""
-    from vllm.model_executor.layers.linear import QKVParallelLinear
-    origin_weight_loader = QKVParallelLinear.weight_loader
-    def get_original_weight(loaded_weight, head_dim):
-        n_kv_head = loaded_weight.shape[0] // (2 * head_dim)
-        dim = loaded_weight.shape[1]
-        for i in range(n_kv_head):
-            loaded_weight[i * head_dim : (i + 1) * head_dim, :] = loaded_weight[
-                2 * i * head_dim : (2 * i + 1) * head_dim, :
-            ]
-        original_kv_weight = loaded_weight[: n_kv_head * head_dim, :]
-        assert original_kv_weight.shape == (n_kv_head * head_dim, dim)
-        return original_kv_weight
-    def weight_loader_srt(
-        self,
-        param: Parameter,
-        loaded_weight: torch.Tensor,
-        loaded_shard_id: Optional[str] = None,
-    ):
-        if (
-            loaded_shard_id in ["k", "v"]
-            and loaded_weight.shape[0] == self.head_size * self.total_num_kv_heads * 2
-        ):
-            loaded_weight = get_original_weight(loaded_weight, self.head_size)
-        origin_weight_loader(self, param, loaded_weight, loaded_shard_id)
-    setattr(QKVParallelLinear, "weight_loader", weight_loader_srt)
+            logger.warning(f"Fail to set RLIMIT_NOFILE: {e}")
 def add_api_key_middleware(app, api_key: str):

sglang/test/few_shot_gsm8k.py CHANGED Viewed

@@ -44,7 +44,7 @@ def get_answer_value(answer_str):
         return INVALID
-def main(args):
+def run_eval(args):
     # Select backend
     set_default_backend(RuntimeEndpoint(f"{args.host}:{args.port}"))
@@ -119,6 +119,12 @@ def main(args):
     # Dump results
     dump_state_text("tmp_output_gsm8k.txt", states)
+    return {
+        "accuracy": acc,
+        "latency": latency,
+        "output_throughput": output_throughput,
+    }
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -129,4 +135,4 @@ if __name__ == "__main__":
     parser.add_argument("--host", type=str, default="http://127.0.0.1")
     parser.add_argument("--port", type=int, default=30000)
     args = parser.parse_args()
-    main(args)
+    run_eval(args)

sglang/test/test_utils.py CHANGED Viewed

@@ -22,6 +22,7 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import kill_child_process
 from sglang.utils import get_exception_traceback
+DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
@@ -304,7 +305,6 @@ def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
 def select_sglang_backend(args: argparse.Namespace):
     if args.backend.startswith("srt"):
         if args.backend == "srt-no-parallel":
-            global_config.enable_parallel_decoding = False
             global_config.enable_parallel_encoding = False
         backend = RuntimeEndpoint(f"{args.host}:{args.port}")
     elif args.backend.startswith("gpt-"):

sglang/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.3.1"
1	+ __version__ = "0.3.1.post2"

{sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.1
+Version: 0.3.1.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                    Version 2.0, January 2004
@@ -269,7 +269,7 @@ Requires-Dist: peft; extra == "test"
 --------------------------------------------------------------------------------
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -278,7 +278,7 @@ The core features include:
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption, welcoming contributions to improve LLM and VLM serving.
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 ## News
 - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.1 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
 cd sglang
 pip install --upgrade pip
@@ -483,7 +483,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
-- To enable DeepSeek MLA acceleration, add `--enable-mla`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
 - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```

{sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/RECORD RENAMED Viewed

@@ -1,18 +1,19 @@
 sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
 sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
-sglang/bench_latency.py,sha256=EvmXpaREU-g25OTcOUTgAUPmA-txfnyjaqY-4hlq97w,16925
-sglang/bench_serving.py,sha256=6OM5JIDuoxJDg-VLE4ijGGcS8-6ViaidV05lIrZmSzo,36239
+sglang/bench_latency.py,sha256=bA50iUYOxEnLjzY2S4AgwxtSAqujUbGfQFwbLZj5XNc,17160
+sglang/bench_server_latency.py,sha256=KvFJgKQTSons7KOG0CBqnnOOx1gW29bBM1Z3GQO_6-E,5599
+sglang/bench_serving.py,sha256=3gIJ1O2x51Fwd4wYJjgwluTbWKXL-azckQte7YC5zIc,36261
 sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
-sglang/global_config.py,sha256=KWpXd4OCCWW2TRQo-dShvLs4jb15ej9Ejhxr_wggzBg,1535
+sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
 sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
 sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
 sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
-sglang/version.py,sha256=r4xAFihOf72W9TD-lpMi6ntWSTKTP2SlzKP1ytkjRbI,22
+sglang/version.py,sha256=U9F0UlFDynnYN5dX-kxehylWCwXo9a6E6W4FfDusfRg,28
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
 sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
-sglang/lang/interpreter.py,sha256=M42SuOnijFaHWOe3Qyi-bNanRt-mYhSDa1wWn1J42Hw,30324
+sglang/lang/interpreter.py,sha256=rOquFbMzxry7IItZlAn5TwtQfxMy718JPxOkiXO-yrg,30234
 sglang/lang/ir.py,sha256=W3UfZikcGeT86PDDjDjw-yNzrKY2e2UYO4DTatMCfm0,17704
 sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
 sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -25,82 +26,83 @@ sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bE
 sglang/srt/conversation.py,sha256=S5w5V6G1xigNxa3UQoSxRcMpQLWWDT9EPBoHBvHkSAk,19663
 sglang/srt/hf_transformers_utils.py,sha256=6HlqcmGPIvnSGaEEICeuzwag1QylSoSGbXRVvUdIMDo,6016
 sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
-sglang/srt/server.py,sha256=FNmTpX7E9fVWj_NFzp4AtE5ODaA_rg5Xm8uZ0FB0X4o,20041
-sglang/srt/server_args.py,sha256=5OHH3gaO1s5Y2UQw2_FnFxwxrsqnUQ_WNqP1R1IWUAA,21877
-sglang/srt/utils.py,sha256=pckOt7gyQfJaV3-h8FPurWyrPij5_EBUX_Xp7x6y6YM,24229
+sglang/srt/server.py,sha256=n4QRn36_t-HAH-lSME3tiZSCUGRQwqMUckgs0paHq5g,20179
+sglang/srt/server_args.py,sha256=3XjDt6SSjTfbOe0HSXA--2aUvrpWSnQmAHYwmeS1-M0,23159
+sglang/srt/utils.py,sha256=8yxiMRttCcfswynkNPWD3yZFNAGFz2P1PzSuxHCBGns,22340
 sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
 sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
 sglang/srt/configs/model_config.py,sha256=OqHrucJQHbH-wxgkGj-Dcx_B888uUGASpLRjz40HaLY,6651
 sglang/srt/constrained/__init__.py,sha256=ze8awDPvwAzdeMwzJ-25kXOQ4nVWoaP55jBDt5UOS_4,2070
 sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
-sglang/srt/constrained/fsm_cache.py,sha256=jItSvCu_XrAgltfejwgvdltaiT98-8lJGBe_84cSnTk,2786
+sglang/srt/constrained/fsm_cache.py,sha256=k7DRUAaiLTEX5UarfJ17gEYQ-QWQAGfvykjYFkM_Y2U,2982
 sglang/srt/constrained/jump_forward.py,sha256=9_HxmXtWjr5S6a5e0cBimbY3ZhiLiJC74V6jIqDXfuo,6575
-sglang/srt/layers/activation.py,sha256=JEXNTgqxoiU4N-gVm4XMjobhft4JKDcMrgTkfpsRUzM,4856
-sglang/srt/layers/attention_backend.py,sha256=39P3iMs7B1iEzCA3EHdqUp3BLafeIVFnFWGzpEhlTRk,18182
+sglang/srt/layers/activation.py,sha256=i3omgj3GdUIZBqJNUjpdJsMc2UM3Lx07FT2J1WICrqA,5171
+sglang/srt/layers/attention_backend.py,sha256=lqMsY4VaOO_szIWoTAinXf1DnP2UsbF32kzvwFySz9w,18119
 sglang/srt/layers/flashinfer_utils.py,sha256=jyaO7XiEisFZg_dfaCbfRCHSHSKYoM1wOzfHa0h1q14,7413
-sglang/srt/layers/layernorm.py,sha256=RXuS4UyksatqTF6lSK7VYyEiUEnBiNIBlEn8q4w84UA,3404
+sglang/srt/layers/layernorm.py,sha256=p_7bnmSpJ_slpoP0Gk5wQPpHtLllUu3imSIRBqGqTP0,3737
 sglang/srt/layers/logits_processor.py,sha256=Js2qSk1Z3uPL2cYO1ARai51f2i8OedV3qdwByQVSJtI,12439
 sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
 sglang/srt/layers/radix_attention.py,sha256=EcVO0fUSmgvE_9R-MlpgJq0O_uT8ACuHzbMi19bANYc,1874
-sglang/srt/layers/sampler.py,sha256=1BKsZbSLBGFVtTJo1LsThuoRjOSOnsL1AiwFxJNIXRs,5800
+sglang/srt/layers/sampler.py,sha256=Y0o1bndTGRD713fHMbN5-LRUiyneBkb7bH_QlkkeqSs,3836
 sglang/srt/layers/torchao_utils.py,sha256=rTECwKSXhj_ylh_iSzfbopz9_lZOFHatquQrNJNLZlE,2703
 sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
 sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
-sglang/srt/layers/fused_moe/layer.py,sha256=GT3r2UPx_PAufJd0SUMOXyh76ymAeYDubd0SM0H71bo,20977
+sglang/srt/layers/fused_moe/layer.py,sha256=raFyvPzjYz-Fv8B3IcOxQYKKCWqXis5mXwg1GFE61y4,22243
 sglang/srt/layers/triton_attention/decode_attention.py,sha256=XCQTX0kUttT1AG5FRMgfQbiXgvoempYD0UR2r6D_vJg,16711
 sglang/srt/layers/triton_attention/extend_attention.py,sha256=XTUTMrE-5jfMEufQUifZ-8NJQABSPcF47qhnNT5Z1iI,11050
 sglang/srt/layers/triton_attention/prefill_attention.py,sha256=QkXPcT02c13zha2M4mBm2S5dh_sS-Gc4FkkrcywRqvc,5377
 sglang/srt/lora/lora.py,sha256=ksj866lgDul6zxO30Jm7Nrjv-mFAMrzdvP8sez3Pl6U,14938
 sglang/srt/lora/lora_config.py,sha256=paVB7F7SIuxr_vodvKf8zzAlH2fdVYHhXxcXV62D0Vo,1411
-sglang/srt/lora/lora_manager.py,sha256=Q7rk1SMEZ75wda68rAZDGVyX_o8ZdIW2I5Fo_llaqHs,9475
+sglang/srt/lora/lora_manager.py,sha256=7J7cGmyy1Ph4HCvLdM-ViAizAbV1snZqD-S7JLWXasI,9561
 sglang/srt/managers/controller_multi.py,sha256=KolZDso2WqH1ZhQw9p1eTmlFRgo4bcvzBxE44_sNE_o,6300
 sglang/srt/managers/controller_single.py,sha256=DiZALP_iIPZQMRx09a-LwT5_Dg7p-WU8HXyMoxJ9sRA,4955
 sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
-sglang/srt/managers/io_struct.py,sha256=bqmL3NDPLqOn6Au3WLF0NOe8Dh7ECMN7BTHCkEZ_Edk,11247
-sglang/srt/managers/policy_scheduler.py,sha256=tiBUi2GJU5eQEBK6HfsO1_YjWtFkougo40954DIp4dM,13026
-sglang/srt/managers/schedule_batch.py,sha256=QfixWzh7ks60eYE52mZHfUseXqcb89h4ZO1Aur3weLU,27340
+sglang/srt/managers/io_struct.py,sha256=yNV5BmeUzLPqv19j79kXQ50Iaqdk4vP-_TciiRf4OEE,11396
+sglang/srt/managers/policy_scheduler.py,sha256=PVo0DV0-5ODNN7FkPkeF1Y8BQ6uuLldPETOlB_YvvL4,11560
+sglang/srt/managers/schedule_batch.py,sha256=ns2qkaYAvzul-LCV1BEB6q1t5jKyftNsReMv62PC8M0,27386
 sglang/srt/managers/tokenizer_manager.py,sha256=ql-sObjl1oRigJwnLtqqTaaw-i7gPTDMoNXDEMftr40,29643
-sglang/srt/managers/tp_worker.py,sha256=Zbl_tFUAsD6Qv1fUEJCn_jyUc3JjDm33yI3Nmu1HY8w,39174
+sglang/srt/managers/tp_worker.py,sha256=0Y0k-roDrBxWZxD0axv5CCvUUW8vsJ8n78TANHLzEFs,39503
 sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
 sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
 sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
 sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
 sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
-sglang/srt/model_executor/cuda_graph_runner.py,sha256=LngmwtBcvobJ_9G8lD966SihjmMJlgMgHe_ZogK1kDg,10090
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=gZ0Wukqz6u67MMIj4MC8JET9jcHdh0rotYzpuPlHruY,10512
 sglang/srt/model_executor/forward_batch_info.py,sha256=yvkhayY9Zu6gysoojcGT73lADGOtfHKkFKWdJLRyACI,6141
-sglang/srt/model_executor/model_runner.py,sha256=7jBSCdZxyDLWMOdwv1vRa7Oue-xbp8lA6I11ZPKFdAc,23457
+sglang/srt/model_executor/model_runner.py,sha256=X7AG1k9AI_kqS8q1i5Bfv-kFysIdqJAVWMGGZoAPThY,22726
 sglang/srt/models/baichuan.py,sha256=NrG1rMJXhemkrUCEf8xKOSDQVsOD-nN8RQz6MWHOg84,15124
 sglang/srt/models/chatglm.py,sha256=KwxLHBEvK02McXDvBS0gnRxfIvOAu2QP7lgibrj9Nbc,13371
 sglang/srt/models/commandr.py,sha256=2rAXRZRb4PkJZ4NWEqP_rIgsjxbdZyHpuoMOarqTWzQ,14163
 sglang/srt/models/dbrx.py,sha256=N_0Ku_p1NCsc29NktUBNqPv7Z33XhYxOZK5xN7nzW4s,14661
 sglang/srt/models/deepseek.py,sha256=7UJgde1EV9ey6d-CKRcEyTKh1_WhZdatpZiltIuqpik,16006
-sglang/srt/models/deepseek_v2.py,sha256=3D9WtPvVOu8U40x_KOksnmWBLmLIcgtV958go8NSj5Q,28307
+sglang/srt/models/deepseek_v2.py,sha256=1J0pt1jZRcBBGYbgt1wGiuxPcrdpfTEUEaGFqju6TVA,28431
 sglang/srt/models/exaone.py,sha256=3I5ZoiLotf7U-8c9QJRubpgf6JDx9I_z-ViXQlCC-x8,13087
 sglang/srt/models/gemma.py,sha256=GkwgGFHgGlXgBZN7s7Wooz5tMyCp1YtgLahU2NOo66M,12273
 sglang/srt/models/gemma2.py,sha256=sFfCNEm0_OOWElRSTDuroRv8wNMX8v_81Uko9m546KA,14923
 sglang/srt/models/gpt_bigcode.py,sha256=kzHYogeGXZF4KHpkXA-RGqvs016mA-6klWxD2QJTi9E,10195
 sglang/srt/models/grok.py,sha256=6I4OwQwNyAbh5GF24_SRm12XYBvM9iGWB-T4TSTJ0wU,14929
 sglang/srt/models/internlm2.py,sha256=6j7JH0p3yib8GZDH8Cmrs-pgwfH3eOlAK6V3Cq64O7w,12202
-sglang/srt/models/llama.py,sha256=tjdjlIxJr31vgbzGBP_el9RgYxw1kzvmqnVinnTVVUw,15259
-sglang/srt/models/llama_classification.py,sha256=A2ABTUD5u4XoWv1dsIPU7wcCQP3jhbDJblMhLgaiFBA,3402
+sglang/srt/models/llama.py,sha256=nbJwRcG9DnurVNSGLKJjnmBmTXP1_5WZpudth_0PVpw,15216
+sglang/srt/models/llama_classification.py,sha256=HF-69J9qIYdfX0R5wEtIgvafMzprKcXdvF3W_orl_kA,3394
 sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
 sglang/srt/models/llava.py,sha256=O4XGdl70Hh4tM_OHapFGHbReC82mbe9xLw6GELKWKhU,24881
 sglang/srt/models/llavavid.py,sha256=ou5uIuskBoBo0lXvqFFfDLBYYVfehx27n-Lu8X9gpLs,11992
 sglang/srt/models/minicpm.py,sha256=ioqCsTCE_oF8xqGF5fm5cK9dclK5Y0EQ1UJfyteIDDo,13825
-sglang/srt/models/minicpm3.py,sha256=S7bNeCAsfvL44Vn350KLaqX674SCb4CpUuDnhjLjr3U,25113
+sglang/srt/models/minicpm3.py,sha256=McPWyy2fQqfHUhi9Nk36rkvvPAS8RmLOY7Vh4ah5c1w,25216
 sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
 sglang/srt/models/mixtral.py,sha256=oRC7mKBrPJhvzkWSabrbeQQQac-jtF4EV6H2Sgjc5JY,13897
 sglang/srt/models/mixtral_quant.py,sha256=wMACJq78OTWj7HlqPDRNEh8cjrVAjKqJEsOG3CO5xow,14072
+sglang/srt/models/olmoe.py,sha256=d0ECpU-IXXwGYg9tkVeMARUbqVcqEnWfpH3rrNiGKA0,15336
 sglang/srt/models/qwen.py,sha256=nqSRzkiZzpRVG6WGQ1MBUclQnXyw8jlvoOq-euM8j5s,9954
 sglang/srt/models/qwen2.py,sha256=9_M-VkHN1_T1XN-gsl_L636QMQ9BLF2WqvTcx_1L6aw,12432
 sglang/srt/models/qwen2_moe.py,sha256=s7b5XnSvsBYtZZUkjPp442m59CqPJ3HxGUIwXBVWsXw,17153
 sglang/srt/models/stablelm.py,sha256=30ngpc0Xq3VxzXJlf6svP1oax8Q3krMJkxM8PVKtZWU,11359
-sglang/srt/models/xverse.py,sha256=luhp_90ZNkTpXHDCURO4MZBy1vbvHTVCwSe4PYYLWBs,13701
-sglang/srt/models/xverse_moe.py,sha256=YR--WZ33G7XEMsS7ZJl1cQ62Q8PDo9gWqpvJBY_cb-M,15886
+sglang/srt/models/xverse.py,sha256=L3g32-je_7JmzF2-hztaIVshHYCIv7jOM3oFs-fb2MY,13658
+sglang/srt/models/xverse_moe.py,sha256=CgDD9cR83UVfTsPU6WcbHVYBrkYKv_kTdwncTIx7Q7U,15842
 sglang/srt/models/yivl.py,sha256=B6MELthWIm5KdSzX3o2tbbpApY8XdjUdmcQSD4dQe_I,4835
 sglang/srt/openai_api/adapter.py,sha256=CJ47YftRHAip1FMcHIhtCorBtzlIkv7F0Wz_JUcI4T4,51032
 sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
-sglang/srt/sampling/sampling_batch_info.py,sha256=vkwy59Jt51FESYukmwDKwPbCM45WMb16dx_408B3oqc,7900
+sglang/srt/sampling/sampling_batch_info.py,sha256=GewqyxCrW2PFwuzGHaCR59Pvw6j0n2dKGrlJWYQWwW4,6149
 sglang/srt/sampling/sampling_params.py,sha256=ggOXxafqfCD-xrGYcM57byLZ79CIeBP4AD5F44L_CW0,5635
 sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
 sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
@@ -108,7 +110,7 @@ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq
 sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
 sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
 sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
-sglang/test/few_shot_gsm8k.py,sha256=uSHEPvUFbAgWKtaqxkhBpQrQV_SlTk0HN9FhjNLpL4g,3731
+sglang/test/few_shot_gsm8k.py,sha256=To7Sdg-DLF8poIQLwiOBYKbkz-1C_gn6H79vIbyPR-o,3860
 sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
 sglang/test/runners.py,sha256=ZoWhT1TDXfLBVdbivXx1KUu9dhPlGjL_xrP18WLzVLo,11404
 sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
@@ -120,10 +122,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
 sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
 sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
 sglang/test/test_programs.py,sha256=3-XKnppQdCNWjaJb6jwib5Z9OSpgKvH8SFLJbE4J9qI,17001
-sglang/test/test_utils.py,sha256=iBs07MBFxOidipTG1-s2hrCvcURFJVXo7gg10pzAQX8,17168
+sglang/test/test_utils.py,sha256=dsHRd1xLzcjlarxUnDIz2XEHfut7HvqVPwx2Fn7vf10,17179
 sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
-sglang-0.3.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sglang-0.3.1.dist-info/METADATA,sha256=QKZQ7PjuK22x_QlQy1LqPX6y4zLgJJ9FPoNNSkw3cEk,38125
-sglang-0.3.1.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
-sglang-0.3.1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
-sglang-0.3.1.dist-info/RECORD,,
+sglang-0.3.1.post2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.3.1.post2.dist-info/METADATA,sha256=WxMy8Ur_rjPxqVOoWSFoM3eBHWt0cKGyrtwOUfWL-Vc,38114
+sglang-0.3.1.post2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+sglang-0.3.1.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.3.1.post2.dist-info/RECORD,,

{sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (74.1.2)
+Generator: setuptools (75.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/LICENSE RENAMED Viewed

File without changes

{sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/top_level.txt RENAMED Viewed

File without changes

sglang 0.3.1__py3-none-any.whl → 0.3.1.post2__py3-none-any.whl

sglang 0.3.1__py3-none-any.whl → 0.3.1.post2__py3-none-any.whl