sglang 0.3.1.post1__py3-none-any.whl → 0.3.1.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +3 -1
- sglang/bench_server_latency.py +187 -0
- sglang/bench_serving.py +1 -1
- sglang/srt/layers/activation.py +6 -3
- sglang/srt/layers/layernorm.py +10 -7
- sglang/srt/layers/sampler.py +9 -2
- sglang/srt/managers/io_struct.py +3 -0
- sglang/srt/managers/policy_scheduler.py +49 -93
- sglang/srt/managers/schedule_batch.py +1 -1
- sglang/srt/managers/tp_worker.py +11 -6
- sglang/srt/model_executor/cuda_graph_runner.py +15 -14
- sglang/srt/model_executor/model_runner.py +13 -5
- sglang/srt/models/deepseek_v2.py +2 -2
- sglang/srt/models/llama.py +1 -3
- sglang/srt/models/llama_classification.py +2 -3
- sglang/srt/models/minicpm3.py +2 -2
- sglang/srt/models/xverse.py +1 -3
- sglang/srt/models/xverse_moe.py +1 -4
- sglang/srt/server_args.py +17 -21
- sglang/test/few_shot_gsm8k.py +8 -2
- sglang/test/test_utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/METADATA +4 -5
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/RECORD +27 -26
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/LICENSE +0 -0
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/top_level.txt +0 -0
sglang/bench_latency.py
CHANGED
@@ -1,5 +1,7 @@
 """
-Benchmark the latency of a
+Benchmark the latency of running a single static batch.
+This script does not launch a server and uses the low-level APIs.
+It accepts arguments similar to those of launch_server.py.
 
 # Usage (latency test)
 ## with dummy weights:
sglang/bench_server_latency.py
ADDED
@@ -0,0 +1,187 @@
+"""
+Benchmark the latency of serving a single batch with a real server.
+This script launches a server and uses the HTTP interface.
+It accepts arguments similar to those of launch_server.py.
+
+Usage:
+
+python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+"""
+
+import argparse
+import dataclasses
+import itertools
+import json
+import multiprocessing
+import os
+import time
+from typing import Tuple
+
+import numpy as np
+import requests
+
+from sglang.srt.server import launch_server
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import kill_child_process
+
+
+@dataclasses.dataclass
+class BenchArgs:
+    run_name: str = "default"
+    batch_size: Tuple[int] = (1,)
+    input_len: Tuple[int] = (1024,)
+    output_len: Tuple[int] = (16,)
+    result_filename: str = "result.jsonl"
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
+        parser.add_argument(
+            "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
+        )
+        parser.add_argument(
+            "--input-len", type=int, nargs="+", default=BenchArgs.input_len
+        )
+        parser.add_argument(
+            "--output-len", type=int, nargs="+", default=BenchArgs.output_len
+        )
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        # use the default value's type to case the args into correct types.
+        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+        return cls(
+            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+        )
+
+
+def launch_server_internal(server_args):
+    try:
+        launch_server(server_args)
+    except Exception as e:
+        raise e
+    finally:
+        kill_child_process(os.getpid(), including_parent=False)
+
+
+def launch_server_process(server_args: ServerArgs):
+    proc = multiprocessing.Process(target=launch_server_internal, args=(server_args,))
+    proc.start()
+    base_url = f"http://{server_args.host}:{server_args.port}"
+    timeout = 600
+
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        try:
+            headers = {
+                "Content-Type": "application/json; charset=utf-8",
+            }
+            response = requests.get(f"{base_url}/v1/models", headers=headers)
+            if response.status_code == 200:
+                return proc, base_url
+        except requests.RequestException:
+            pass
+        time.sleep(10)
+    raise TimeoutError("Server failed to start within the timeout period.")
+
+
+def run_one_case(
+    url: str,
+    batch_size: int,
+    input_len: int,
+    output_len: int,
+    run_name: str,
+    result_filename: str,
+):
+    input_ids = [
+        [int(x) for x in np.random.randint(0, high=16384, size=(input_len,))]
+        for _ in range(batch_size)
+    ]
+
+    tic = time.time()
+    response = requests.post(
+        url + "/generate",
+        json={
+            "input_ids": input_ids,
+            "sampling_params": {
+                "temperature": 0,
+                "max_new_tokens": output_len,
+                "ignore_eos": True,
+            },
+        },
+    )
+    latency = time.time() - tic
+
+    _ = response.json()
+    output_throughput = batch_size * output_len / latency
+    overall_throughput = batch_size * (input_len + output_len) / latency
+
+    print(f"batch size: {batch_size}")
+    print(f"latency: {latency:.2f} s")
+    print(f"output throughput: {output_throughput:.2f} token/s")
+    print(f"(input + output) throughput: {overall_throughput:.2f} token/s")
+
+    if result_filename:
+        with open(result_filename, "a") as fout:
+            res = {
+                "run_name": run_name,
+                "batch_size": batch_size,
+                "input_len": input_len,
+                "output_len": output_len,
+                "latency": round(latency, 4),
+                "output_throughput": round(output_throughput, 2),
+                "overall_throughput": round(overall_throughput, 2),
+            }
+            fout.write(json.dumps(res) + "\n")
+
+
+def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
+    proc, base_url = launch_server_process(server_args)
+
+    # warmup
+    run_one_case(
+        base_url,
+        batch_size=16,
+        input_len=1024,
+        output_len=16,
+        run_name="",
+        result_filename="",
+    )
+
+    # benchmark
+    try:
+        for bs, il, ol in itertools.product(
+            bench_args.batch_size, bench_args.input_len, bench_args.output_len
+        ):
+            run_one_case(
+                base_url,
+                bs,
+                il,
+                ol,
+                bench_args.run_name,
+                bench_args.result_filename,
+            )
+    finally:
+        kill_child_process(proc.pid)
+
+    print(f"\nResults are saved to {bench_args.result_filename}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    BenchArgs.add_cli_args(parser)
+    # For this script, model-path is not required
+    assert (
+        parser._actions[1].option_strings[0] == "--model-path"
+    ), "options changed, this code need to be updated"
+    parser._actions[1].required = False
+    args = parser.parse_args()
+
+    server_args = ServerArgs.from_cli_args(args)
+    bench_args = BenchArgs.from_cli_args(args)
+
+    run_benchmark(server_args, bench_args)
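For reference, the new benchmark can also be driven from Python instead of the CLI. The sketch below only uses names that appear in the diff above (BenchArgs, run_benchmark, ServerArgs); the model path and sweep values are illustrative placeholders, not defaults taken from the package.

# Minimal sketch: run the new server-latency benchmark programmatically.
# Assumes a GPU machine with the model weights available; the model path
# and batch/length values below are placeholders.
from sglang.bench_server_latency import BenchArgs, run_benchmark
from sglang.srt.server_args import ServerArgs

if __name__ == "__main__":
    server_args = ServerArgs(model_path="meta-llama/Meta-Llama-3.1-8B")
    bench_args = BenchArgs(
        run_name="demo",
        batch_size=(1, 16),
        input_len=(1024,),
        output_len=(8,),
        result_filename="result.jsonl",
    )
    run_benchmark(server_args, bench_args)  # appends one JSON line per case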
sglang/bench_serving.py
CHANGED
@@ -2,7 +2,7 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py
 
 """
-Benchmark online serving.
+Benchmark online serving with dynamic requests.
 
 Usage:
 python3 -m sglang.bench_serving --backend sglang --num-prompt 10
sglang/srt/layers/activation.py
CHANGED
@@ -19,7 +19,12 @@ from typing import Optional
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-
+
+from sglang.srt.utils import is_hip
+
+if not is_hip():
+    from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+
 from vllm.distributed import (
     divide,
     get_tensor_model_parallel_rank,
@@ -29,8 +34,6 @@ from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.utils import set_weight_attrs
 
-from sglang.srt.utils import is_hip
-
 logger = logging.getLogger(__name__)
 
 
sglang/srt/layers/layernorm.py
CHANGED
@@ -20,16 +20,19 @@ from typing import Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
-from flashinfer.norm import (
-    fused_add_rmsnorm,
-    gemma_fused_add_rmsnorm,
-    gemma_rmsnorm,
-    rmsnorm,
-)
-from vllm.model_executor.custom_op import CustomOp
 
 from sglang.srt.utils import is_hip
 
+if not is_hip():
+    from flashinfer.norm import (
+        fused_add_rmsnorm,
+        gemma_fused_add_rmsnorm,
+        gemma_rmsnorm,
+        rmsnorm,
+    )
+
+from vllm.model_executor.custom_op import CustomOp
+
 logger = logging.getLogger(__name__)
 
 
sglang/srt/layers/sampler.py
CHANGED
@@ -31,8 +31,11 @@ class Sampler(nn.Module):
         logits = logits.next_token_logits
 
         # Post process logits
+        logits = logits.contiguous()
         logits.div_(sampling_info.temperatures)
-        probs =
+        probs = torch.softmax(logits, dim=-1)
+        logits = None
+        del logits
 
         if torch.any(torch.isnan(probs)):
             logger.warning("Detected errors during sampling! NaN in the probability.")
@@ -53,7 +56,11 @@ class Sampler(nn.Module):
                 )
             else:
                 batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
-                    probs,
+                    probs,
+                    uniform_samples,
+                    sampling_info.top_ks,
+                    sampling_info.top_ps,
+                    filter_apply_order="joint",
                 )
 
         if not torch.all(success):
sglang/srt/managers/io_struct.py
CHANGED
@@ -133,6 +133,9 @@ class GenerateReqInput:
                 self.image_data = [None] * num
             elif not isinstance(self.image_data, list):
                 self.image_data = [self.image_data] * num
+            elif isinstance(self.image_data, list):
+                # multi-image with n > 1
+                self.image_data = self.image_data * num
 
             if self.sampling_params is None:
                 self.sampling_params = [{}] * num
sglang/srt/managers/policy_scheduler.py
CHANGED
@@ -119,19 +119,32 @@ class PrefillAdder:
         self.running_batch = running_batch
         self.new_token_ratio = new_token_ratio
         self.rem_total_tokens = rem_total_tokens - mixed_with_decode_tokens
-        self.rem_total_tokens_ = self.rem_total_tokens
-        self.total_tokens = rem_total_tokens
         self.rem_input_tokens = rem_input_tokens - mixed_with_decode_tokens
         self.rem_chunk_tokens = rem_chunk_tokens
         if self.rem_chunk_tokens is not None:
             self.rem_chunk_tokens -= mixed_with_decode_tokens
 
+        self.cur_rem_tokens = rem_total_tokens - mixed_with_decode_tokens
+
         self.req_states = None
         self.can_run_list = []
         self.new_inflight_req = None
         self.log_hit_tokens = 0
         self.log_input_tokens = 0
 
+        if running_batch is not None:
+            # Pre-remove the tokens which will be occupied by the running requests
+            self.rem_total_tokens -= sum(
+                [
+                    min(
+                        (r.sampling_params.max_new_tokens - len(r.output_ids)),
+                        CLIP_MAX_NEW_TOKENS,
+                    )
+                    * self.new_token_ratio
+                    for r in running_batch.reqs
+                ]
+            )
+
     def no_remaining_tokens(self):
         return (
             self.rem_total_tokens <= 0
@@ -141,31 +154,14 @@ class PrefillAdder:
                 if self.rem_chunk_tokens is not None
                 else False
             )
-
-
-    def remove_running_tokens(self, running_batch: ScheduleBatch):
-        self.rem_total_tokens -= sum(
-            [
-                min(
-                    (r.sampling_params.max_new_tokens - len(r.output_ids)),
-                    CLIP_MAX_NEW_TOKENS,
-                )
-                * self.new_token_ratio
-                for r in running_batch.reqs
-            ]
-        )
-        self.rem_total_tokens_ -= sum(
-            [
-                r.sampling_params.max_new_tokens - len(r.output_ids)
-                for r in running_batch.reqs
-            ]
+            or self.cur_rem_tokens <= 0
         )
 
     def _prefill_one_req(
         self, prefix_len: int, extend_input_len: int, max_new_tokens: int
     ):
         self.rem_total_tokens -= extend_input_len + max_new_tokens
-        self.
+        self.cur_rem_tokens -= extend_input_len
         self.rem_input_tokens -= extend_input_len
         if self.rem_chunk_tokens is not None:
             self.rem_chunk_tokens -= extend_input_len
@@ -173,29 +169,7 @@ class PrefillAdder:
         self.log_hit_tokens += prefix_len
         self.log_input_tokens += extend_input_len
 
-    def add_inflight_req_ignore_eos(self, req: Req):
-        truncated = req.extend_input_len > self.rem_chunk_tokens
-        req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
-        req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
-        self.can_run_list.append(req)
-
-        self._prefill_one_req(
-            0,
-            req.extend_input_len,
-            (
-                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
-                if not truncated
-                else 0
-            ),
-        )
-
-        # Return if chunked prefill not finished
-        return req if truncated else None
-
     def add_inflight_req(self, req: Req):
-        if req.sampling_params.ignore_eos:
-            return self.add_inflight_req_ignore_eos(req)
-
         truncated = req.extend_input_len > self.rem_chunk_tokens
         req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
         req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
@@ -225,7 +199,7 @@ class PrefillAdder:
         self.rem_total_tokens += delta
 
     def add_one_req_ignore_eos(self, req: Req):
-        def
+        def add_req_state(r, insert_sort=False):
             new_token_ratio = (
                 1.0 if r.sampling_params.ignore_eos else self.new_token_ratio
             )
@@ -235,56 +209,38 @@ class PrefillAdder:
             tokens_occupied = len(r.origin_input_ids) + len(r.output_ids)
 
             if tokens_left > 0:
-
-
-
-
-
-        can_run = False
-        if (
-            req.extend_input_len + req.sampling_params.max_new_tokens
-            <= self.rem_total_tokens
-        ):
-            can_run = True
-
-        if not can_run:
-            if self.req_states is None:
-                self.req_states = []
-                if self.running_batch is not None:
-                    for r in self.running_batch.reqs:
-                        state = get_req_state(r)
-                        if state is not None:
-                            self.req_states.append(state)
-                for r in self.can_run_list:
-                    state = get_req_state(r)
-                    if state is not None:
-                        self.req_states.append(state)
-                state = get_req_state(req)
-                if state is not None:
-                    self.req_states.append(state)
-
-                self.req_states.sort(key=lambda x: x[0])
-            else:
-                state = get_req_state(req)
-                if state is not None:
-                    for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
-                        if tokens_left >= state[0]:
-                            self.req_states.insert(i, state)
+                if not insert_sort:
+                    self.req_states.append((tokens_left, tokens_occupied))
+                else:
+                    for i in range(len(self.req_states)):
+                        if tokens_left <= self.req_states[i][0]:
                             break
-
-
-
-
-
-
-
-
-
-        )
-
-
-
-
+                    self.req_states.insert(i, (tokens_left, tokens_occupied))
+
+        if self.req_states is None:
+            self.req_states = []
+            add_req_state(req)
+            if self.running_batch is not None:
+                for r in self.running_batch.reqs:
+                    add_req_state(r)
+            for r in self.can_run_list:
+                add_req_state(r)
+            self.req_states.sort(key=lambda x: x[0])
+        else:
+            add_req_state(req, insert_sort=True)
+
+        cur_rem_tokens = self.cur_rem_tokens - len(req.origin_input_ids)
+        tokens_freed = 0
+        for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
+            decode_steps = (
+                self.req_states[i + 1][0]
+                if i + 1 < len(self.req_states)
+                else tokens_left
+            )
+            bs = len(self.req_states) - i
+            if cur_rem_tokens + tokens_freed - decode_steps * bs <= 0:
+                return False
+            tokens_freed += tokens_occupied
 
         if req.extend_input_len <= self.rem_chunk_tokens:
             self.can_run_list.append(req)
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -40,7 +40,7 @@ global_server_args_dict = {
     "attention_backend": ServerArgs.attention_backend,
     "sampling_backend": ServerArgs.sampling_backend,
     "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
-    "
+    "disable_mla": ServerArgs.disable_mla,
     "torchao_config": ServerArgs.torchao_config,
 }
 
sglang/srt/managers/tp_worker.py
CHANGED
@@ -445,9 +445,6 @@ class ModelTpServer:
             num_mixed_running,
         )
 
-        if self.running_batch is not None:
-            adder.remove_running_tokens(self.running_batch)
-
         has_inflight = self.current_inflight_req is not None
         if self.current_inflight_req is not None:
             self.current_inflight_req.init_next_round_input(
@@ -465,9 +462,6 @@ class ModelTpServer:
             )
 
         for req in self.waiting_queue:
-            if adder.no_remaining_tokens():
-                break
-            req.init_next_round_input(None if prefix_computed else self.tree_cache)
             if (
                 self.lora_paths is not None
                 and len(
@@ -478,6 +472,10 @@ class ModelTpServer:
                 > self.max_loras_per_batch
             ):
                 break
+
+            if adder.no_remaining_tokens():
+                break
+            req.init_next_round_input(None if prefix_computed else self.tree_cache)
             res = adder.add_one_req(req)
             if (
                 not res
@@ -507,6 +505,11 @@ class ModelTpServer:
         else:
             tree_cache_hit_rate = 0.0
 
+        num_used = self.max_total_num_tokens - (
+            self.token_to_kv_pool.available_size()
+            + self.tree_cache.evictable_size()
+        )
+
         if num_mixed_running > 0:
             logger.info(
                 f"Prefill batch"
@@ -515,6 +518,7 @@ class ModelTpServer:
                 f"#new-token: {adder.log_input_tokens}, "
                 f"#cached-token: {adder.log_hit_tokens}, "
                 f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
+                f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
                 f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}"
             )
         else:
@@ -524,6 +528,7 @@ class ModelTpServer:
                 f"#new-token: {adder.log_input_tokens}, "
                 f"#cached-token: {adder.log_hit_tokens}, "
                 f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
+                f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
                 f"#running-req: {running_bs}, "
                 f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}"
             )
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -108,6 +108,10 @@ class CudaGraphRunner:
             self.capture_bs = list(range(1, 32)) + [64, 128]
         else:
             self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
+
+        self.capture_bs = [
+            bs for bs in self.capture_bs if bs <= model_runner.req_to_token_pool.size
+        ]
         self.compile_bs = (
             [
                 bs
@@ -118,21 +122,8 @@ class CudaGraphRunner:
             else []
         )
 
-        # Common inputs
-        self.max_bs = max(self.capture_bs)
-        self.input_ids = torch.zeros((self.max_bs,), dtype=torch.int32, device="cuda")
-        self.req_pool_indices = torch.zeros(
-            (self.max_bs,), dtype=torch.int32, device="cuda"
-        )
-        self.seq_lens = torch.ones((self.max_bs,), dtype=torch.int32, device="cuda")
-        self.position_ids_offsets = torch.ones(
-            (self.max_bs,), dtype=torch.int32, device="cuda"
-        )
-        self.out_cache_loc = torch.zeros(
-            (self.max_bs,), dtype=torch.int32, device="cuda"
-        )
-
         # Attention backend
+        self.max_bs = max(self.capture_bs)
         self.model_runner.attn_backend.init_cuda_graph_state(self.max_bs)
         self.seq_len_fill_value = (
             self.model_runner.attn_backend.get_cuda_graph_seq_len_fill_value()
@@ -141,6 +132,16 @@ class CudaGraphRunner:
         if self.use_torch_compile:
             set_torch_compile_config()
 
+        # Common inputs
+        with torch.device("cuda"):
+            self.input_ids = torch.zeros((self.max_bs,), dtype=torch.int32)
+            self.req_pool_indices = torch.zeros((self.max_bs,), dtype=torch.int32)
+            self.seq_lens = torch.full(
+                (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
+            )
+            self.position_ids_offsets = torch.ones((self.max_bs,), dtype=torch.int32)
+            self.out_cache_loc = torch.zeros((self.max_bs,), dtype=torch.int32)
+
         # Capture
         try:
             self.capture()
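The rewritten buffer allocation above relies on torch.device("cuda") as a context manager, so the per-tensor device="cuda" arguments can be dropped. A small standalone illustration of that PyTorch pattern (independent of sglang; requires a CUDA build of torch):

import torch

# Tensors created inside the context are placed on the given device by default,
# which is why the rewritten code above omits device="cuda" on each call.
with torch.device("cuda"):
    seq_lens = torch.full((8,), 1, dtype=torch.int32)
    out_cache_loc = torch.zeros((8,), dtype=torch.int32)

print(seq_lens.device, out_cache_loc.device)  # cuda:0 cuda:0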
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -86,12 +86,20 @@ class ModelRunner:
         self.is_multimodal_model = is_multimodal_model(
             self.model_config.hf_config.architectures
         )
+
+        if (
+            self.model_config.attention_arch == AttentionArch.MLA
+            and not self.server_args.disable_mla
+        ):
+            logger.info("MLA optimization is tunred on. Use triton backend.")
+            self.server_args.attention_backend = "triton"
+
         global_server_args_dict.update(
             {
                 "attention_backend": server_args.attention_backend,
                 "sampling_backend": server_args.sampling_backend,
                 "triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32,
-                "
+                "disable_mla": server_args.disable_mla,
                 "torchao_config": server_args.torchao_config,
             }
         )
@@ -329,7 +337,7 @@ class ModelRunner:
         )
         if (
             self.model_config.attention_arch == AttentionArch.MLA
-            and self.server_args.
+            and not self.server_args.disable_mla
         ):
             cell_size = (
                 (self.model_config.kv_lora_rank + self.model_config.qk_rope_head_dim)
@@ -392,12 +400,12 @@ class ModelRunner:
         )
 
         self.req_to_token_pool = ReqToTokenPool(
-            max_num_reqs,
-            self.model_config.context_len +
+            max_num_reqs + 1,
+            self.model_config.context_len + 4,
         )
         if (
             self.model_config.attention_arch == AttentionArch.MLA
-            and self.server_args.
+            and not self.server_args.disable_mla
         ):
             self.token_to_kv_pool = MLATokenToKVPool(
                 self.max_total_num_tokens,
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -507,7 +507,7 @@ class DeepseekV2DecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
-        if global_server_args_dict["
+        if not global_server_args_dict["disable_mla"]:
             self.self_attn = DeepseekV2AttentionMLA(
                 config=config,
                 hidden_size=self.hidden_size,
@@ -732,7 +732,7 @@ class DeepseekV2ForCausalLM(nn.Module):
                 )
                 weight_loader(param, loaded_weight)
 
-        if global_server_args_dict["
+        if not global_server_args_dict["disable_mla"]:
             for layer_id in range(self.config.num_hidden_layers):
                 self_attn = self.model.layers[layer_id].self_attn
                 w_kc, w_vc = self_attn.kv_b_proj.weight.unflatten(
sglang/srt/models/llama.py
CHANGED
@@ -305,8 +305,6 @@ class LlamaForCausalLM(nn.Module):
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
 
-        self.param_dict = dict(self.named_parameters())
-
     @torch.no_grad()
     def forward(
         self,
@@ -374,7 +372,7 @@ class LlamaForCausalLM(nn.Module):
             (".gate_up_proj", ".gate_proj", 0),
             (".gate_up_proj", ".up_proj", 1),
         ]
-        params_dict = self.
+        params_dict = dict(self.named_parameters())
 
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name or "projector" in name:
sglang/srt/models/llama_classification.py
CHANGED
@@ -36,6 +36,7 @@ class LlamaForClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
+        self.torchao_config = None
         self.quant_config = quant_config
         self.model = LlamaModel(config, quant_config=quant_config)
 
@@ -44,8 +45,6 @@ class LlamaForClassification(nn.Module):
         )
         self.eos_token_id = config.eos_token_id
 
-        self.param_dict = dict(self.named_parameters())
-
     @torch.no_grad()
     def forward(
         self,
@@ -77,7 +76,7 @@ class LlamaForClassification(nn.Module):
         return logits_output
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        params_dict = self.
+        params_dict = dict(self.named_parameters())
 
         for name, loaded_weight in weights:
             if "classification_head" in name:
sglang/srt/models/minicpm3.py
CHANGED
@@ -419,7 +419,7 @@ class MiniCPM3DecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
-        if global_server_args_dict["
+        if not global_server_args_dict["disable_mla"]:
             self.self_attn = MiniCPM3AttentionMLA(
                 config=config,
                 hidden_size=self.hidden_size,
@@ -653,7 +653,7 @@ class MiniCPM3ForCausalLM(nn.Module):
                 )
                 weight_loader(param, loaded_weight)
 
-        if global_server_args_dict["
+        if not global_server_args_dict["disable_mla"]:
             for layer_id in range(self.config.num_hidden_layers):
                 self_attn = self.model.layers[layer_id].self_attn
                 w_kc, w_vc = self_attn.kv_b_proj.weight.unflatten(
sglang/srt/models/xverse.py
CHANGED
@@ -307,8 +307,6 @@ class XverseForCausalLM(nn.Module):
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
 
-        self.param_dict = dict(self.named_parameters())
-
     @torch.no_grad()
     def forward(
         self,
@@ -333,7 +331,7 @@ class XverseForCausalLM(nn.Module):
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
         ]
-        params_dict = self.
+        params_dict = dict(self.named_parameters())
 
         def load_weights_per_param(name, loaded_weight):
             if "rotary_emb.inv_freq" in name or "projector" in name:
sglang/srt/models/xverse_moe.py
CHANGED
@@ -383,8 +383,6 @@ class XverseMoeForCausalLM(nn.Module):
         )
         self.logits_processor = LogitsProcessor(config)
 
-        self.param_dict = dict(self.named_parameters())
-
     @torch.no_grad()
     def forward(
         self,
@@ -406,8 +404,7 @@ class XverseMoeForCausalLM(nn.Module):
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
         ]
-
-        params_dict = self.param_dict
+        params_dict = dict(self.named_parameters())
 
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
sglang/srt/server_args.py
CHANGED
@@ -26,17 +26,6 @@ from sglang.srt.utils import is_hip
 logger = logging.getLogger(__name__)
 
 
-class LoRAPathAction(argparse.Action):
-    def __call__(self, parser, namespace, values, option_string=None):
-        setattr(namespace, self.dest, {})
-        for lora_path in values:
-            if "=" in lora_path:
-                name, path = lora_path.split("=", 1)
-                getattr(namespace, self.dest)[name] = path
-            else:
-                getattr(namespace, self.dest)[lora_path] = lora_path
-
-
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
@@ -108,12 +97,12 @@ class ServerArgs:
     disable_cuda_graph_padding: bool = False
     disable_disk_cache: bool = False
    disable_custom_all_reduce: bool = False
+    disable_mla: bool = False
     enable_mixed_chunk: bool = False
     enable_torch_compile: bool = False
     max_torch_compile_bs: int = 32
     torchao_config: str = ""
     enable_p2p_check: bool = False
-    enable_mla: bool = False
     triton_attention_reduce_in_fp32: bool = False
 
     # LoRA
@@ -173,10 +162,6 @@ class ServerArgs:
             self.sampling_backend = "pytorch"
 
         # Default kernel backends
-        if self.enable_mla:
-            logger.info("MLA optimization is tunred on. Use triton backend.")
-            self.attention_backend = "triton"
-
        if self.attention_backend is None:
             self.attention_backend = "flashinfer"
 
@@ -514,6 +499,11 @@ class ServerArgs:
             default=False,
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
+        parser.add_argument(
+            "--disable-mla",
+            action="store_true",
+            help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
+        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -541,11 +531,6 @@ class ServerArgs:
             action="store_true",
             help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
         )
-        parser.add_argument(
-            "--enable-mla",
-            action="store_true",
-            help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
-        )
         parser.add_argument(
             "--triton-attention-reduce-in-fp32",
             action="store_true",
@@ -623,3 +608,14 @@ class PortArgs:
     controller_port: int
     detokenizer_port: int
     nccl_ports: List[int]
+
+
+class LoRAPathAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, {})
+        for lora_path in values:
+            if "=" in lora_path:
+                name, path = lora_path.split("=", 1)
+                getattr(namespace, self.dest)[name] = path
+            else:
+                getattr(namespace, self.dest)[lora_path] = lora_path
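Taken together with the model_runner.py hunks above, `--enable-mla` is replaced by `--disable-mla`, so MLA is now on by default for MLA-capable checkpoints such as DeepSeek-V2. A minimal sketch of the new switch (the model path is illustrative, not taken from the package):

from sglang.srt.server_args import ServerArgs

# MLA is assumed on unless explicitly disabled; the old enable_mla field is gone.
args_default = ServerArgs(model_path="deepseek-ai/DeepSeek-V2-Lite")
args_no_mla = ServerArgs(model_path="deepseek-ai/DeepSeek-V2-Lite", disable_mla=True)
print(args_default.disable_mla, args_no_mla.disable_mla)  # False True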
sglang/test/few_shot_gsm8k.py
CHANGED
@@ -44,7 +44,7 @@ def get_answer_value(answer_str):
     return INVALID
 
 
-def
+def run_eval(args):
     # Select backend
     set_default_backend(RuntimeEndpoint(f"{args.host}:{args.port}"))
 
@@ -119,6 +119,12 @@ def main(args):
     # Dump results
     dump_state_text("tmp_output_gsm8k.txt", states)
 
+    return {
+        "accuracy": acc,
+        "latency": latency,
+        "output_throughput": output_throughput,
+    }
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -129,4 +135,4 @@ if __name__ == "__main__":
     parser.add_argument("--host", type=str, default="http://127.0.0.1")
     parser.add_argument("--port", type=int, default=30000)
     args = parser.parse_args()
-
+    run_eval(args)
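Because the entry point is now `run_eval(args)` and it returns its metrics, other test code can call it directly. A sketch under the assumption that the Namespace carries the same fields as the script's CLI flags (only `--host` and `--port` are visible in this diff; the remaining field names and values are assumptions for illustration):

import argparse

from sglang.test.few_shot_gsm8k import run_eval

# Field names other than host/port are assumed to mirror the script's CLI flags.
args = argparse.Namespace(
    num_shots=5,
    num_questions=200,
    max_new_tokens=512,
    parallel=128,
    data_path=None,
    host="http://127.0.0.1",
    port=30000,
)
metrics = run_eval(args)
print(metrics)  # {"accuracy": ..., "latency": ..., "output_throughput": ...}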
sglang/test/test_utils.py
CHANGED
@@ -22,6 +22,7 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import kill_child_process
 from sglang.utils import get_exception_traceback
 
+DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.1.post1"
+__version__ = "0.3.1.post2"
{sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.1.post1
+Version: 0.3.1.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -269,7 +269,7 @@ Requires-Dist: peft; extra == "test"
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |
 
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -278,7 +278,7 @@ The core features include:
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## News
 - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.1.post1 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -483,7 +483,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
-- To enable DeepSeek MLA acceleration, add `--enable-mla`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
 - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
{sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/RECORD
CHANGED
@@ -1,13 +1,14 @@
 sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
 sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
-sglang/bench_latency.py,sha256=
-sglang/
+sglang/bench_latency.py,sha256=bA50iUYOxEnLjzY2S4AgwxtSAqujUbGfQFwbLZj5XNc,17160
+sglang/bench_server_latency.py,sha256=KvFJgKQTSons7KOG0CBqnnOOx1gW29bBM1Z3GQO_6-E,5599
+sglang/bench_serving.py,sha256=3gIJ1O2x51Fwd4wYJjgwluTbWKXL-azckQte7YC5zIc,36261
 sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
 sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
 sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
 sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
 sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
-sglang/version.py,sha256=
+sglang/version.py,sha256=U9F0UlFDynnYN5dX-kxehylWCwXo9a6E6W4FfDusfRg,28
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -26,7 +27,7 @@ sglang/srt/conversation.py,sha256=S5w5V6G1xigNxa3UQoSxRcMpQLWWDT9EPBoHBvHkSAk,19
 sglang/srt/hf_transformers_utils.py,sha256=6HlqcmGPIvnSGaEEICeuzwag1QylSoSGbXRVvUdIMDo,6016
 sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
 sglang/srt/server.py,sha256=n4QRn36_t-HAH-lSME3tiZSCUGRQwqMUckgs0paHq5g,20179
-sglang/srt/server_args.py,sha256=
+sglang/srt/server_args.py,sha256=3XjDt6SSjTfbOe0HSXA--2aUvrpWSnQmAHYwmeS1-M0,23159
 sglang/srt/utils.py,sha256=8yxiMRttCcfswynkNPWD3yZFNAGFz2P1PzSuxHCBGns,22340
 sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
 sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
@@ -35,14 +36,14 @@ sglang/srt/constrained/__init__.py,sha256=ze8awDPvwAzdeMwzJ-25kXOQ4nVWoaP55jBDt5
 sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
 sglang/srt/constrained/fsm_cache.py,sha256=k7DRUAaiLTEX5UarfJ17gEYQ-QWQAGfvykjYFkM_Y2U,2982
 sglang/srt/constrained/jump_forward.py,sha256=9_HxmXtWjr5S6a5e0cBimbY3ZhiLiJC74V6jIqDXfuo,6575
-sglang/srt/layers/activation.py,sha256=
+sglang/srt/layers/activation.py,sha256=i3omgj3GdUIZBqJNUjpdJsMc2UM3Lx07FT2J1WICrqA,5171
 sglang/srt/layers/attention_backend.py,sha256=lqMsY4VaOO_szIWoTAinXf1DnP2UsbF32kzvwFySz9w,18119
 sglang/srt/layers/flashinfer_utils.py,sha256=jyaO7XiEisFZg_dfaCbfRCHSHSKYoM1wOzfHa0h1q14,7413
-sglang/srt/layers/layernorm.py,sha256
+sglang/srt/layers/layernorm.py,sha256=p_7bnmSpJ_slpoP0Gk5wQPpHtLllUu3imSIRBqGqTP0,3737
 sglang/srt/layers/logits_processor.py,sha256=Js2qSk1Z3uPL2cYO1ARai51f2i8OedV3qdwByQVSJtI,12439
 sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
 sglang/srt/layers/radix_attention.py,sha256=EcVO0fUSmgvE_9R-MlpgJq0O_uT8ACuHzbMi19bANYc,1874
-sglang/srt/layers/sampler.py,sha256=
+sglang/srt/layers/sampler.py,sha256=Y0o1bndTGRD713fHMbN5-LRUiyneBkb7bH_QlkkeqSs,3836
 sglang/srt/layers/torchao_utils.py,sha256=rTECwKSXhj_ylh_iSzfbopz9_lZOFHatquQrNJNLZlE,2703
 sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
 sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
@@ -56,38 +57,38 @@ sglang/srt/lora/lora_manager.py,sha256=7J7cGmyy1Ph4HCvLdM-ViAizAbV1snZqD-S7JLWXa
 sglang/srt/managers/controller_multi.py,sha256=KolZDso2WqH1ZhQw9p1eTmlFRgo4bcvzBxE44_sNE_o,6300
 sglang/srt/managers/controller_single.py,sha256=DiZALP_iIPZQMRx09a-LwT5_Dg7p-WU8HXyMoxJ9sRA,4955
 sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
-sglang/srt/managers/io_struct.py,sha256=
-sglang/srt/managers/policy_scheduler.py,sha256=
-sglang/srt/managers/schedule_batch.py,sha256=
+sglang/srt/managers/io_struct.py,sha256=yNV5BmeUzLPqv19j79kXQ50Iaqdk4vP-_TciiRf4OEE,11396
+sglang/srt/managers/policy_scheduler.py,sha256=PVo0DV0-5ODNN7FkPkeF1Y8BQ6uuLldPETOlB_YvvL4,11560
+sglang/srt/managers/schedule_batch.py,sha256=ns2qkaYAvzul-LCV1BEB6q1t5jKyftNsReMv62PC8M0,27386
 sglang/srt/managers/tokenizer_manager.py,sha256=ql-sObjl1oRigJwnLtqqTaaw-i7gPTDMoNXDEMftr40,29643
-sglang/srt/managers/tp_worker.py,sha256=
+sglang/srt/managers/tp_worker.py,sha256=0Y0k-roDrBxWZxD0axv5CCvUUW8vsJ8n78TANHLzEFs,39503
 sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
 sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
 sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
 sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
 sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
-sglang/srt/model_executor/cuda_graph_runner.py,sha256=
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=gZ0Wukqz6u67MMIj4MC8JET9jcHdh0rotYzpuPlHruY,10512
 sglang/srt/model_executor/forward_batch_info.py,sha256=yvkhayY9Zu6gysoojcGT73lADGOtfHKkFKWdJLRyACI,6141
-sglang/srt/model_executor/model_runner.py,sha256=
+sglang/srt/model_executor/model_runner.py,sha256=X7AG1k9AI_kqS8q1i5Bfv-kFysIdqJAVWMGGZoAPThY,22726
 sglang/srt/models/baichuan.py,sha256=NrG1rMJXhemkrUCEf8xKOSDQVsOD-nN8RQz6MWHOg84,15124
 sglang/srt/models/chatglm.py,sha256=KwxLHBEvK02McXDvBS0gnRxfIvOAu2QP7lgibrj9Nbc,13371
 sglang/srt/models/commandr.py,sha256=2rAXRZRb4PkJZ4NWEqP_rIgsjxbdZyHpuoMOarqTWzQ,14163
 sglang/srt/models/dbrx.py,sha256=N_0Ku_p1NCsc29NktUBNqPv7Z33XhYxOZK5xN7nzW4s,14661
 sglang/srt/models/deepseek.py,sha256=7UJgde1EV9ey6d-CKRcEyTKh1_WhZdatpZiltIuqpik,16006
-sglang/srt/models/deepseek_v2.py,sha256=
+sglang/srt/models/deepseek_v2.py,sha256=1J0pt1jZRcBBGYbgt1wGiuxPcrdpfTEUEaGFqju6TVA,28431
 sglang/srt/models/exaone.py,sha256=3I5ZoiLotf7U-8c9QJRubpgf6JDx9I_z-ViXQlCC-x8,13087
 sglang/srt/models/gemma.py,sha256=GkwgGFHgGlXgBZN7s7Wooz5tMyCp1YtgLahU2NOo66M,12273
 sglang/srt/models/gemma2.py,sha256=sFfCNEm0_OOWElRSTDuroRv8wNMX8v_81Uko9m546KA,14923
 sglang/srt/models/gpt_bigcode.py,sha256=kzHYogeGXZF4KHpkXA-RGqvs016mA-6klWxD2QJTi9E,10195
 sglang/srt/models/grok.py,sha256=6I4OwQwNyAbh5GF24_SRm12XYBvM9iGWB-T4TSTJ0wU,14929
 sglang/srt/models/internlm2.py,sha256=6j7JH0p3yib8GZDH8Cmrs-pgwfH3eOlAK6V3Cq64O7w,12202
-sglang/srt/models/llama.py,sha256=
-sglang/srt/models/llama_classification.py,sha256=
+sglang/srt/models/llama.py,sha256=nbJwRcG9DnurVNSGLKJjnmBmTXP1_5WZpudth_0PVpw,15216
+sglang/srt/models/llama_classification.py,sha256=HF-69J9qIYdfX0R5wEtIgvafMzprKcXdvF3W_orl_kA,3394
 sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
 sglang/srt/models/llava.py,sha256=O4XGdl70Hh4tM_OHapFGHbReC82mbe9xLw6GELKWKhU,24881
 sglang/srt/models/llavavid.py,sha256=ou5uIuskBoBo0lXvqFFfDLBYYVfehx27n-Lu8X9gpLs,11992
 sglang/srt/models/minicpm.py,sha256=ioqCsTCE_oF8xqGF5fm5cK9dclK5Y0EQ1UJfyteIDDo,13825
-sglang/srt/models/minicpm3.py,sha256=
+sglang/srt/models/minicpm3.py,sha256=McPWyy2fQqfHUhi9Nk36rkvvPAS8RmLOY7Vh4ah5c1w,25216
 sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
 sglang/srt/models/mixtral.py,sha256=oRC7mKBrPJhvzkWSabrbeQQQac-jtF4EV6H2Sgjc5JY,13897
 sglang/srt/models/mixtral_quant.py,sha256=wMACJq78OTWj7HlqPDRNEh8cjrVAjKqJEsOG3CO5xow,14072
@@ -96,8 +97,8 @@ sglang/srt/models/qwen.py,sha256=nqSRzkiZzpRVG6WGQ1MBUclQnXyw8jlvoOq-euM8j5s,995
 sglang/srt/models/qwen2.py,sha256=9_M-VkHN1_T1XN-gsl_L636QMQ9BLF2WqvTcx_1L6aw,12432
 sglang/srt/models/qwen2_moe.py,sha256=s7b5XnSvsBYtZZUkjPp442m59CqPJ3HxGUIwXBVWsXw,17153
 sglang/srt/models/stablelm.py,sha256=30ngpc0Xq3VxzXJlf6svP1oax8Q3krMJkxM8PVKtZWU,11359
-sglang/srt/models/xverse.py,sha256=
-sglang/srt/models/xverse_moe.py,sha256=
+sglang/srt/models/xverse.py,sha256=L3g32-je_7JmzF2-hztaIVshHYCIv7jOM3oFs-fb2MY,13658
+sglang/srt/models/xverse_moe.py,sha256=CgDD9cR83UVfTsPU6WcbHVYBrkYKv_kTdwncTIx7Q7U,15842
 sglang/srt/models/yivl.py,sha256=B6MELthWIm5KdSzX3o2tbbpApY8XdjUdmcQSD4dQe_I,4835
 sglang/srt/openai_api/adapter.py,sha256=CJ47YftRHAip1FMcHIhtCorBtzlIkv7F0Wz_JUcI4T4,51032
 sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
@@ -109,7 +110,7 @@ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq
 sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
 sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
 sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
-sglang/test/few_shot_gsm8k.py,sha256=
+sglang/test/few_shot_gsm8k.py,sha256=To7Sdg-DLF8poIQLwiOBYKbkz-1C_gn6H79vIbyPR-o,3860
 sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
 sglang/test/runners.py,sha256=ZoWhT1TDXfLBVdbivXx1KUu9dhPlGjL_xrP18WLzVLo,11404
 sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
@@ -121,10 +122,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
 sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
 sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
 sglang/test/test_programs.py,sha256=3-XKnppQdCNWjaJb6jwib5Z9OSpgKvH8SFLJbE4J9qI,17001
-sglang/test/test_utils.py,sha256=
+sglang/test/test_utils.py,sha256=dsHRd1xLzcjlarxUnDIz2XEHfut7HvqVPwx2Fn7vf10,17179
 sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
-sglang-0.3.1.
-sglang-0.3.1.
-sglang-0.3.1.
-sglang-0.3.1.
-sglang-0.3.1.
+sglang-0.3.1.post2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.3.1.post2.dist-info/METADATA,sha256=WxMy8Ur_rjPxqVOoWSFoM3eBHWt0cKGyrtwOUfWL-Vc,38114
+sglang-0.3.1.post2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+sglang-0.3.1.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.3.1.post2.dist-info/RECORD,,
{sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/LICENSE
File without changes
{sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/WHEEL
File without changes
{sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/top_level.txt
File without changes