sglang-0.4.1-py3-none-any.whl → sglang-0.4.1.post1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
sglang/bench_serving.py CHANGED
@@ -897,6 +897,7 @@ async def benchmark(
  else:
  raise ValueError(f"Unknown backend: {backend}")

+ # Limit concurrency
  # From https://github.com/vllm-project/vllm/pull/9390
  semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

@@ -906,6 +907,7 @@ async def benchmark(
  async with semaphore:
  return await request_func(request_func_input=request_func_input, pbar=pbar)

+ # Warmup
  print("Starting initial single prompt test run...")
  test_prompt, test_prompt_len, test_output_len = input_requests[0]
  test_input = RequestFuncInput(
@@ -924,11 +926,15 @@ async def benchmark(
  f"are correctly specified. Error: {test_output.error}"
  )
  else:
- requests.post(base_url + "/flush_cache")
  print("Initial test run completed. Starting main benchmark run...")

- time.sleep(1.5)
+ # Flush cache
+ if "sglang" in backend:
+ requests.post(base_url + "/flush_cache")
+
+ time.sleep(1.0)

+ # Start profiler
  if profile:
  print("Starting profiler...")
  profile_output = await async_request_profile(
@@ -939,6 +945,7 @@ async def benchmark(

  pbar = None if disable_tqdm else tqdm(total=len(input_requests))

+ # Run all requests
  benchmark_start_time = time.perf_counter()
  tasks: List[asyncio.Task] = []
  async for request in get_request(input_requests, request_rate):
@@ -959,6 +966,7 @@ async def benchmark(
  )
  outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

+ # Stop profiler
  if profile:
  print("Stopping profiler...")
  profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
@@ -968,8 +976,8 @@ async def benchmark(
  if pbar is not None:
  pbar.close()

+ # Compute metrics and print results
  benchmark_duration = time.perf_counter() - benchmark_start_time
-
  metrics, output_lens = calculate_metrics(
  input_requests=input_requests,
  outputs=outputs,
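The new "Limit concurrency" block caps the number of in-flight requests with a shared semaphore. A minimal, self-contained sketch of the same pattern (all names here are illustrative, not the benchmark's own):

    import asyncio
    from typing import Optional

    async def fake_request(i: int) -> int:
        # Stand-in for request_func: pretend to call the server.
        await asyncio.sleep(0.01)
        return i

    async def run_all(n_requests: int, max_concurrency: Optional[int]):
        # Same shape as bench_serving: create a semaphore only if a cap is set.
        semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

        async def limited_request_func(i: int) -> int:
            if semaphore is None:
                return await fake_request(i)
            async with semaphore:
                return await fake_request(i)

        return await asyncio.gather(*(limited_request_func(i) for i in range(n_requests)))

    print(asyncio.run(run_all(100, max_concurrency=8)))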
sglang/lang/backend/openai.py CHANGED
@@ -366,6 +366,11 @@ class OpenAI(BaseBackend):
  def openai_completion(
  client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
  ):
+ # if "ebnf" is in kwargs, warn and remove
+ if "ebnf" in kwargs:
+ warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
+ del kwargs["ebnf"]
+
  for attempt in range(retries):
  try:
  if is_chat:
@@ -398,6 +403,11 @@ def openai_completion(
  def openai_completion_stream(
  client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
  ):
+ # if "ebnf" is in kwargs, warn and remove
+ if "ebnf" in kwargs:
+ warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
+ del kwargs["ebnf"]
+
  for attempt in range(retries):
  try:
  if is_chat:
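Both helpers now drop the new "ebnf" parameter before forwarding kwargs, since official OpenAI endpoints cannot enforce an EBNF grammar. A small sketch of the same guard in isolation (the function and its arguments are illustrative):

    import warnings

    def sanitize_openai_kwargs(**kwargs):
        # Mirror of the guard above: warn about and drop unsupported grammar kwargs.
        if "ebnf" in kwargs:
            warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
            del kwargs["ebnf"]
        return kwargs

    print(sanitize_openai_kwargs(temperature=0.0, ebnf='root ::= "yes" | "no"'))
    # -> {'temperature': 0.0}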
sglang/srt/constrained/xgrammar_backend.py CHANGED
@@ -126,6 +126,12 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
  f"Skip invalid json_schema: json_schema={key_string}, {e=}"
  )
  return None
+ elif key_type == "ebnf":
+ try:
+ ctx = self.grammar_compiler.compile_grammar(key_string)
+ except RuntimeError as e:
+ logging.warning(f"Skip invalid ebnf: ebnf={key_string}, {e=}")
+ return None
  elif key_type == "regex":
  logger.warning(
  "regex hasn't been supported by xgrammar yet. This is skipped."
sglang/srt/layers/attention/triton_ops/extend_attention.py CHANGED
@@ -292,27 +292,33 @@ def extend_attention_fwd(
  BLOCK_DPE = 0
  BLOCK_DV = triton.next_power_of_2(Lv)

- if is_cuda_available and CUDA_CAPABILITY[0] >= 9:
- if Lq <= 256:
- BLOCK_M, BLOCK_N = (128, 64)
- else:
- BLOCK_M, BLOCK_N = (32, 64)
- elif is_cuda_available and CUDA_CAPABILITY[0] >= 8:
- if Lq <= 128:
- BLOCK_M, BLOCK_N = (128, 128)
- elif Lq <= 256:
- BLOCK_M, BLOCK_N = (64, 64)
- else:
- BLOCK_M, BLOCK_N = (32, 64)
+ if is_hip_:
+ BLOCK_M, BLOCK_N = (64, 64)
+ num_warps = 4
+
  else:
- BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+ if is_cuda_available and CUDA_CAPABILITY[0] >= 9:
+ if Lq <= 256:
+ BLOCK_M, BLOCK_N = (128, 64)
+ else:
+ BLOCK_M, BLOCK_N = (32, 64)
+ elif is_cuda_available and CUDA_CAPABILITY[0] >= 8:
+ if Lq <= 128:
+ BLOCK_M, BLOCK_N = (128, 128)
+ elif Lq <= 256:
+ BLOCK_M, BLOCK_N = (64, 64)
+ else:
+ BLOCK_M, BLOCK_N = (32, 64)
+ else:
+ BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+
+ num_warps = 4 if Lk <= 64 else 8

  sm_scale = sm_scale or 1.0 / (Lq**0.5)
  batch_size, head_num = b_seq_len.shape[0], q_extend.shape[1]
  kv_group_num = q_extend.shape[1] // k_extend.shape[1]

  grid = (batch_size, head_num, triton.cdiv(max_len_extend, BLOCK_M))
- num_warps = 4 if Lk <= 64 else 8
  num_stages = 1

  extra_kargs = {}
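The refactor routes AMD (HIP) GPUs to fixed (64, 64) tiles with 4 warps, while CUDA devices keep the capability-dependent tuning. A pure-Python restatement of the selection logic for reference (this sketches the branch above, not the kernel itself):

    def pick_launch_params(is_hip, is_cuda, capability_major, Lq, Lk):
        # HIP: fixed tile sizes and warp count.
        if is_hip:
            return (64, 64), 4
        # CUDA: tile sizes depend on compute capability and head dim Lq.
        if is_cuda and capability_major >= 9:
            block = (128, 64) if Lq <= 256 else (32, 64)
        elif is_cuda and capability_major >= 8:
            if Lq <= 128:
                block = (128, 128)
            elif Lq <= 256:
                block = (64, 64)
            else:
                block = (32, 64)
        else:
            block = (64, 64) if Lq <= 128 else (32, 32)
        return block, (4 if Lk <= 64 else 8)

    print(pick_launch_params(False, True, 9, 128, 64))  # ((128, 64), 4)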
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py CHANGED
@@ -11,12 +11,17 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
  import torch
  import triton
  import triton.language as tl
- from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
  from vllm import _custom_ops as ops

  from sglang.srt.layers.moe.topk import select_experts
  from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
- from sglang.srt.utils import direct_register_custom_op, get_device_name
+ from sglang.srt.utils import direct_register_custom_op, get_device_name, is_hip
+
+ not_hip = False
+ if not is_hip():
+ from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+
+ not_hip = True

  logger = logging.getLogger(__name__)
  padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0
@@ -267,8 +272,14 @@ def moe_align_block_size(
  (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
  )
  num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
- # FIXME(zhyncs)
- if num_experts >= 256:
+ if not_hip and num_experts >= 224:
+ token_cnts_buffer = torch.empty(
+ (num_experts + 1) * num_experts, dtype=torch.int32, device=topk_ids.device
+ )
+ cumsum_buffer = torch.empty(
+ num_experts + 1, dtype=torch.int32, device=topk_ids.device
+ )
+
  sgl_moe_align_block_size(
  topk_ids,
  num_experts,
@@ -276,6 +287,8 @@ def moe_align_block_size(
  sorted_ids,
  expert_ids,
  num_tokens_post_pad,
+ token_cnts_buffer,
+ cumsum_buffer,
  )
  else:
  ops.moe_align_block_size(
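The sgl-kernel path (now CUDA-only and taken from 224 experts up instead of 256) preallocates two int32 scratch buffers whose sizes follow directly from the expert count. A small sketch of the sizing arithmetic, per the allocation above:

    # Scratch-buffer lengths for the sgl_moe_align_block_size path:
    num_experts = 256
    token_cnts_len = (num_experts + 1) * num_experts  # per-slot expert token counts
    cumsum_len = num_experts + 1                      # prefix sums over experts
    print(token_cnts_len, cumsum_len)                 # 65792 257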
sglang/srt/layers/moe/topk.py CHANGED
@@ -1,3 +1,17 @@
+ # Copyright 2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
  from typing import Callable, Optional

  import torch
sglang/srt/layers/quantization/fp8_kernel.py CHANGED
@@ -1,3 +1,17 @@
+ # Copyright 2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
  from typing import List, Tuple

  import torch
sglang/srt/managers/schedule_policy.py CHANGED
@@ -248,7 +248,7 @@ class PrefillAdder:
  self.can_run_list.append(req)

  self._prefill_one_req(
- len(req.prefix_indices),
+ 0,
  req.extend_input_len,
  (
  min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION)
sglang/srt/managers/scheduler.py CHANGED
@@ -468,9 +468,6 @@ class Scheduler:
  self.send_to_tokenizer.send_pyobj(
  UpdateWeightFromDiskReqOutput(success, message)
  )
- elif isinstance(recv_req, GetWeightsByNameReqInput):
- parameter = self.get_weights_by_name(recv_req)
- self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter))
  elif isinstance(recv_req, InitWeightsUpdateGroupReqInput):
  success, message = self.init_weights_update_group(recv_req)
  self.send_to_tokenizer.send_pyobj(
@@ -565,7 +562,7 @@ class Scheduler:

  if req.logprob_start_len == -1:
  # By default, only return the logprobs for output tokens
- req.logprob_start_len = len(recv_req.input_ids) - 1
+ req.logprob_start_len = len(req.origin_input_ids) - 1

  # Truncate prompts that are too long
  if len(req.origin_input_ids) > self.max_req_input_len:
@@ -589,12 +586,15 @@ class Scheduler:
  if (
  req.sampling_params.json_schema is not None
  or req.sampling_params.regex is not None
+ or req.sampling_params.ebnf is not None
  ):
  assert self.grammar_backend is not None
  if req.sampling_params.json_schema is not None:
  key = ("json", req.sampling_params.json_schema)
  elif req.sampling_params.regex is not None:
  key = ("regex", req.sampling_params.regex)
+ elif req.sampling_params.ebnf is not None:
+ key = ("ebnf", req.sampling_params.ebnf)

  req.grammar = self.grammar_backend.get_cached_value(key)
  if not req.grammar:
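With the scheduler and SamplingParams changes in this release, an EBNF constraint can be passed through the native /generate endpoint alongside regex and json_schema. A hypothetical request against a locally launched server (the address and grammar string are illustrative):

    import requests

    resp = requests.post(
        "http://localhost:30000/generate",
        json={
            "text": "Is the sky blue? Answer:",
            "sampling_params": {
                "max_new_tokens": 8,
                "ebnf": 'root ::= "yes" | "no"',  # cached under the ("ebnf", ...) key
            },
        },
    )
    print(resp.json())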
@@ -629,16 +629,13 @@ class Scheduler:
  self.waiting_queue.append(req)

  def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
- if isinstance(self.tree_cache, RadixCache):
- self.tree_cache_metrics["total"] += (
- adder.log_input_tokens + adder.log_hit_tokens
- ) / 10**9
- self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
- tree_cache_hit_rate = (
- self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
- )
- else:
- tree_cache_hit_rate = 0.0
+ self.tree_cache_metrics["total"] += (
+ adder.log_input_tokens + adder.log_hit_tokens
+ ) / 10**9
+ self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
+ tree_cache_hit_rate = (
+ self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
+ )

  num_used = self.max_total_num_tokens - (
  self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
sglang/srt/managers/tokenizer_manager.py CHANGED
@@ -22,7 +22,7 @@ import signal
  import sys
  import time
  import uuid
- from typing import Any, Awaitable, Dict, List, Optional, Tuple, Union
+ from typing import Any, Awaitable, Dict, Generic, List, Optional, Tuple, TypeVar, Union

  import fastapi
  import uvloop
@@ -173,6 +173,15 @@ class TokenizerManager:

  # Others
  self.gracefully_exit = False
+ self.init_weights_update_group_communicator = _Communicator(
+ self.send_to_scheduler, server_args.dp_size
+ )
+ self.update_weights_from_distributed_communicator = _Communicator(
+ self.send_to_scheduler, server_args.dp_size
+ )
+ self.get_weights_by_name_communicator = _Communicator(
+ self.send_to_scheduler, server_args.dp_size
+ )

  # Metrics
  if self.enable_metrics:
@@ -190,8 +199,7 @@ class TokenizerManager:
  ):
  created_time = time.time()

- if self.to_create_loop:
- self.create_handle_loop()
+ self.auto_create_handle_loop()

  if isinstance(obj, EmbeddingReqInput) and self.is_generation:
  raise ValueError(
@@ -440,8 +448,7 @@ class TokenizerManager:
  obj: UpdateWeightFromDiskReqInput,
  request: Optional[fastapi.Request] = None,
  ) -> Tuple[bool, str]:
- if self.to_create_loop:
- self.create_handle_loop()
+ self.auto_create_handle_loop()

  # default the load format to the server_args
  if obj.load_format is None:
@@ -456,7 +463,7 @@ class TokenizerManager:

  async def _wait_for_model_update_from_disk(
  self, obj: UpdateWeightFromDiskReqInput
- ) -> Tuple[bool, str, int]:
+ ) -> Tuple[bool, str]:
  self.send_to_scheduler.send_pyobj(obj)
  self.model_update_result = asyncio.Future()
  if self.server_args.dp_size == 1:
@@ -485,15 +492,11 @@ class TokenizerManager:
  obj: InitWeightsUpdateGroupReqInput,
  request: Optional[fastapi.Request] = None,
  ) -> Tuple[bool, str]:
- if self.to_create_loop:
- self.create_handle_loop()
- self.send_to_scheduler.send_pyobj(obj)
-
- self.init_weights_update_group_result = asyncio.Future()
+ self.auto_create_handle_loop()
  assert (
  self.server_args.dp_size == 1
  ), "dp_size must be 1 for init parameter update group"
- result = await self.init_weights_update_group_result
+ result = (await self.init_weights_update_group_communicator(obj))[0]
  return result.success, result.message

  async def update_weights_from_distributed(
@@ -501,44 +504,32 @@ class TokenizerManager:
  obj: UpdateWeightsFromDistributedReqInput,
  request: Optional[fastapi.Request] = None,
  ) -> Tuple[bool, str]:
- if self.to_create_loop:
- self.create_handle_loop()
+ self.auto_create_handle_loop()
+ assert (
+ self.server_args.dp_size == 1
+ ), "dp_size must be for update weights from distributed"

  # This means that weight sync
  # cannot run while requests are in progress.
  async with self.model_update_lock.writer_lock:
- self.send_to_scheduler.send_pyobj(obj)
- self.parameter_update_result: Awaitable[
- UpdateWeightsFromDistributedReqOutput
- ] = asyncio.Future()
- assert (
- self.server_args.dp_size == 1
- ), "dp_size must be for update weights from distributed"
- result = await self.parameter_update_result
+ result = (await self.update_weights_from_distributed_communicator(obj))[0]
  return result.success, result.message

  async def get_weights_by_name(
  self, obj: GetWeightsByNameReqInput, request: Optional[fastapi.Request] = None
  ):
- if self.to_create_loop:
- self.create_handle_loop()
-
- self.send_to_scheduler.send_pyobj(obj)
- self.get_weights_by_name_result = asyncio.Future()
+ self.auto_create_handle_loop()
+ results = await self.get_weights_by_name_communicator(obj)
+ all_parameters = [r.parameter for r in results]
  if self.server_args.dp_size == 1:
- result = await self.get_weights_by_name_result
- return result.parameter
+ return all_parameters[0]
  else:
- self.get_weights_by_name_tmp = []
- result = await self.get_weights_by_name_result
- all_parameters = [r.parameter for r in result]
  return all_parameters

  async def open_session(
  self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None
  ):
- if self.to_create_loop:
- self.create_handle_loop()
+ self.auto_create_handle_loop()

  session_id = uuid.uuid4().hex
  obj.session_id = session_id
@@ -568,7 +559,7 @@ class TokenizerManager:
  background_tasks.add_task(abort_request)
  return background_tasks

- def create_handle_loop(self):
+ def auto_create_handle_loop(self):
  if not self.to_create_loop:
  return

@@ -711,21 +702,14 @@ class TokenizerManager:
  assert (
  self.server_args.dp_size == 1
  ), "dp_size must be 1 for init parameter update group"
- self.init_weights_update_group_result.set_result(recv_obj)
+ self.init_weights_update_group_communicator.handle_recv(recv_obj)
  elif isinstance(recv_obj, UpdateWeightsFromDistributedReqOutput):
  assert (
  self.server_args.dp_size == 1
  ), "dp_size must be 1 for update weights from distributed"
- self.parameter_update_result.set_result(recv_obj)
+ self.update_weights_from_distributed_communicator.handle_recv(recv_obj)
  elif isinstance(recv_obj, GetWeightsByNameReqOutput):
- if self.server_args.dp_size == 1:
- self.get_weights_by_name_result.set_result(recv_obj)
- else:
- self.get_weights_by_name_tmp.append(recv_obj)
- if len(self.get_weights_by_name_tmp) == self.server_args.dp_size:
- self.get_weights_by_name_result.set_result(
- self.get_weights_by_name_tmp
- )
+ self.get_weights_by_name_communicator.handle_recv(recv_obj)
  else:
  raise ValueError(f"Invalid object: {recv_obj=}")

@@ -809,3 +793,28 @@ class SignalHandler:
  f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..."
  )
  self.tokenizer_manager.gracefully_exit = True
+
+
+ T = TypeVar("T")
+
+
+ class _Communicator(Generic[T]):
+ def __init__(self, sender, fan_out: int):
+ self._sender = sender
+ self._fan_out = fan_out
+ self._result_future: Optional[asyncio.Future] = None
+ self._result_values: Optional[List[T]] = None
+
+ async def __call__(self, obj):
+ self._sender.send_pyobj(obj)
+ self._result_future = asyncio.Future()
+ self._result_values = []
+ await self._result_future
+ result_values = self._result_values
+ self._result_future = self._result_values = None
+ return result_values
+
+ def handle_recv(self, recv_obj: T):
+ self._result_values.append(recv_obj)
+ if len(self._result_values) == self._fan_out:
+ self._result_future.set_result(None)
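The new _Communicator generalizes the old per-call futures: it sends one object and resolves only after fan_out replies (one per data-parallel rank) have arrived. A minimal demo, assuming the _Communicator class above is in scope and using a dummy sender in place of the ZMQ socket:

    import asyncio

    class DummySender:
        # Stand-in for send_to_scheduler: just records sent objects.
        def send_pyobj(self, obj):
            print("sent:", obj)

    async def demo():
        comm = _Communicator(DummySender(), fan_out=2)  # e.g. dp_size == 2
        task = asyncio.create_task(comm("get_weights"))
        await asyncio.sleep(0)               # let the call send and start waiting
        comm.handle_recv("reply-from-dp0")   # first reply is buffered
        comm.handle_recv("reply-from-dp1")   # second reply resolves the future
        print(await task)                    # ['reply-from-dp0', 'reply-from-dp1']

    asyncio.run(demo())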
sglang/srt/model_executor/model_runner.py CHANGED
@@ -95,12 +95,6 @@ class ModelRunner:
  ):
  logger.info("MLA optimization is turned on. Use triton backend.")
  self.server_args.attention_backend = "triton"
- # FIXME(HandH1998)
- if (
- "DeepseekV3ForCausalLM" in self.model_config.hf_config.architectures
- and not self.server_args.disable_cuda_graph
- ):
- self.server_args.disable_cuda_graph = True

  if self.server_args.enable_double_sparsity:
  logger.info(
sglang/srt/model_loader/loader.py CHANGED
@@ -770,6 +770,21 @@ class BitsAndBytesModelLoader(BaseModelLoader):
  quant_state_dict,
  )

+ def _is_8bit_weight_name(self, weight_name: str):
+ quantized_suffix = {".scb", ".weight_format"}
+ return any(weight_name.lower().endswith(suffix) for suffix in quantized_suffix)
+
+ def _is_4bit_weight_name(self, weight_name: str):
+ quantized_suffix = {
+ "absmax",
+ "quant_map",
+ "nested_absmax",
+ "nested_quant_map",
+ "bitsandbytes",
+ }
+ suffix = weight_name.split(".")[-1]
+ return any(q_suffix in suffix for q_suffix in quantized_suffix)
+
  def _quantized_8bit_generator(
  self, hf_weights_files, use_safetensors, quant_state_dict
  ) -> Generator:
@@ -779,21 +794,18 @@ class BitsAndBytesModelLoader(BaseModelLoader):
  if not weight_name.lower().endswith(".scb"):
  continue

- weight_key = weight_name.lower().replace(".scb", ".qweight")
+ weight_key = weight_name.lower().replace(".scb", ".weight")
  quant_state_dict[weight_key] = weight_tensor

  for weight_name, weight_tensor in self._hf_weight_iter(
  hf_weights_files, use_safetensors
  ):
-
- if not weight_name.endswith((".weight", ".bias")):
+ if self._is_8bit_weight_name(weight_name):
  continue

- qweight_name = weight_name.replace(".weight", ".qweight")
-
- if qweight_name in quant_state_dict:
+ if weight_name in quant_state_dict:
  set_weight_attrs(weight_tensor, {"load_in_8bit": True})
- yield qweight_name, weight_tensor
+ yield weight_name, weight_tensor
  else:
  yield weight_name, weight_tensor

@@ -806,7 +818,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
  weight_iterator = self._hf_weight_iter(hf_weights_files, use_safetensors)
  temp_state_dict = {}
  for weight_name, weight_tensor in weight_iterator:
- if weight_name.endswith((".weight", ".bias")):
+ if not self._is_4bit_weight_name(weight_name):
  continue
  # bitsandbytes library requires
  # weight.quant_state.bitsandbytes__* in CPU
@@ -830,16 +842,15 @@ class BitsAndBytesModelLoader(BaseModelLoader):
  hf_weights_files, use_safetensors
  ):

- if not weight_name.endswith((".weight", ".bias")):
+ if self._is_4bit_weight_name(weight_name):
  continue

  if (f"{weight_name}.quant_state.bitsandbytes__nf4" in temp_state_dict) or (
  f"{weight_name}.quant_state.bitsandbytes__fp4" in temp_state_dict
  ):
  quant_state = _parse_quant_state(weight_name, temp_state_dict)
- weight_name = weight_name.replace(".weight", ".qweight")
  quant_state_dict[weight_name] = quant_state
- yield weight_name.replace(".weight", ".qweight"), weight_tensor
+ yield weight_name, weight_tensor
  else:
  yield weight_name, weight_tensor
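Instead of renaming weights to ".qweight", the loader now classifies tensors by their bitsandbytes suffixes. A standalone sketch of the 4-bit classifier's behavior (the body mirrors _is_4bit_weight_name above):

    def is_4bit_weight_name(weight_name: str) -> bool:
        # Quantization side-tensors carry one of these suffixes; real weights do not.
        quantized_suffix = {"absmax", "quant_map", "nested_absmax",
                            "nested_quant_map", "bitsandbytes"}
        suffix = weight_name.split(".")[-1]
        return any(q_suffix in suffix for q_suffix in quantized_suffix)

    print(is_4bit_weight_name("model.layers.0.q_proj.weight.absmax"))  # True
    print(is_4bit_weight_name("model.layers.0.q_proj.weight"))         # False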
sglang/srt/models/gemma2.py CHANGED
@@ -307,6 +307,25 @@ class Gemma2Model(nn.Module):


  class Gemma2ForCausalLM(nn.Module):
+ # BitandBytes specific attributes
+ default_bitsandbytes_target_modules = [
+ ".gate_proj.",
+ ".down_proj.",
+ ".up_proj.",
+ ".q_proj.",
+ ".k_proj.",
+ ".v_proj.",
+ ".o_proj.",
+ ]
+ bitsandbytes_stacked_params_mapping = {
+ # shard_name, weight_name, index
+ "q_proj": ("qkv_proj", 0),
+ "k_proj": ("qkv_proj", 1),
+ "v_proj": ("qkv_proj", 2),
+ "gate_proj": ("gate_up_proj", 0),
+ "up_proj": ("gate_up_proj", 1),
+ }
+
  packed_modules_mapping = {
  "qkv_proj": [
  "q_proj",
sglang/srt/models/llama.py CHANGED
@@ -325,8 +325,8 @@ class LlamaForCausalLM(nn.Module):
  self.config = config
  self.quant_config = quant_config
  self.model = LlamaModel(config, quant_config=quant_config)
- # Llama 3.2 1B Insturct set tie_word_embeddings to True
- # Llama 3.1 8B Insturct set tie_word_embeddings to False
+ # Llama 3.2 1B Instruct set tie_word_embeddings to True
+ # Llama 3.1 8B Instruct set tie_word_embeddings to False
  if self.config.tie_word_embeddings:
  self.lm_head = self.model.embed_tokens
  else:
sglang/srt/openai_api/adapter.py CHANGED
@@ -517,6 +517,7 @@ def v1_generate_request(
  "repetition_penalty": request.repetition_penalty,
  "regex": request.regex,
  "json_schema": request.json_schema,
+ "ebnf": request.ebnf,
  "n": request.n,
  "no_stop_trim": request.no_stop_trim,
  "ignore_eos": request.ignore_eos,
@@ -692,6 +693,14 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False):

  async def v1_completions(tokenizer_manager, raw_request: Request):
  request_json = await raw_request.json()
+ if "extra_body" in request_json:
+ extra = request_json["extra_body"]
+ if "ebnf" in extra:
+ request_json["ebnf"] = extra["ebnf"]
+ if "regex" in extra:
+ request_json["regex"] = extra["regex"]
+ # remove extra_body to avoid pydantic conflict
+ del request_json["extra_body"]
  all_requests = [CompletionRequest(**request_json)]
  adapted_request, request = v1_generate_request(all_requests)

@@ -936,6 +945,7 @@ def v1_chat_generate_request(
  "frequency_penalty": request.frequency_penalty,
  "repetition_penalty": request.repetition_penalty,
  "regex": request.regex,
+ "ebnf": request.ebnf,
  "n": request.n,
  "no_stop_trim": request.no_stop_trim,
  "ignore_eos": request.ignore_eos,
@@ -1108,6 +1118,15 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):

  async def v1_chat_completions(tokenizer_manager, raw_request: Request):
  request_json = await raw_request.json()
+ if "extra_body" in request_json:
+ extra = request_json["extra_body"]
+ # For example, if 'ebnf' is given:
+ if "ebnf" in extra:
+ request_json["ebnf"] = extra["ebnf"]
+ if "regex" in extra:
+ request_json["regex"] = extra["regex"]
+ # remove extra_body to avoid pydantic conflict
+ del request_json["extra_body"]
  all_requests = [ChatCompletionRequest(**request_json)]
  adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)
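Because OpenAI's Python client rejects unknown top-level parameters, the adapter now lifts "ebnf" and "regex" out of extra_body. A hypothetical client call against a locally launched server (address, model name, and grammar are illustrative):

    import openai

    client = openai.Client(base_url="http://localhost:30000/v1", api_key="None")
    response = client.chat.completions.create(
        model="default",
        messages=[{"role": "user", "content": "Answer yes or no."}],
        # extra_body fields are unpacked to top level by the adapter above.
        extra_body={"ebnf": 'root ::= "yes" | "no"'},
    )
    print(response.choices[0].message.content)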
sglang/srt/openai_api/protocol.py CHANGED
@@ -179,6 +179,7 @@ class CompletionRequest(BaseModel):
  ignore_eos: bool = False
  skip_special_tokens: bool = True
  lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+ ebnf: Optional[str] = None


  class CompletionResponseChoice(BaseModel):
@@ -288,6 +289,7 @@ class ChatCompletionRequest(BaseModel):
  ignore_eos: bool = False
  skip_special_tokens: bool = True
  lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+ ebnf: Optional[str] = None


  class ChatMessage(BaseModel):
sglang/srt/sampling/sampling_params.py CHANGED
@@ -36,6 +36,7 @@ class SamplingParams:
  regex: Optional[str] = None,
  n: int = 1,
  json_schema: Optional[str] = None,
+ ebnf: Optional[str] = None,
  no_stop_trim: bool = False,
  ignore_eos: bool = False,
  skip_special_tokens: bool = True,
@@ -60,6 +61,7 @@ class SamplingParams:
  self.regex = regex
  self.n = n
  self.json_schema = json_schema
+ self.ebnf = ebnf
  self.no_stop_trim = no_stop_trim

  # Process some special cases
@@ -111,8 +113,13 @@ class SamplingParams:
  f"min_new_tokens must be in (0, max_new_tokens({self.max_new_tokens})], got "
  f"{self.min_new_tokens}."
  )
- if self.regex is not None and self.json_schema is not None:
- raise ValueError("regex and json_schema cannot be both set.")
+ grammars = [
+ self.json_schema,
+ self.regex,
+ self.ebnf,
+ ] # since mutually exclusive, only one can be set
+ if sum(x is not None for x in grammars) > 1:
+ raise ValueError("Only one of regex, json_schema, or ebnf can be set.")

  def normalize(self, tokenizer):
  # Process stop strings
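The pairwise regex/json_schema check becomes a count over all three constraint types. A standalone restatement of the rule, runnable on its own:

    def validate_grammars(json_schema=None, regex=None, ebnf=None):
        # At most one structured-output constraint may be set at a time.
        grammars = [json_schema, regex, ebnf]
        if sum(x is not None for x in grammars) > 1:
            raise ValueError("Only one of regex, json_schema, or ebnf can be set.")

    validate_grammars(ebnf='root ::= "yes" | "no"')          # passes
    try:
        validate_grammars(regex="yes|no", ebnf='root ::= "yes"')
    except ValueError as e:
        print(e)                                             # raises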
sglang/srt/server.py CHANGED
@@ -245,16 +245,11 @@ async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request):
  try:
  ret = await tokenizer_manager.get_weights_by_name(obj, request)
  if ret is None:
- return ORJSONResponse(
- {"error": {"message": "Get parameter by name failed"}},
- status_code=HTTPStatus.BAD_REQUEST,
- )
+ return _create_error_response("Get parameter by name failed")
  else:
  return ORJSONResponse(ret, status_code=200)
  except Exception as e:
- return ORJSONResponse(
- {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
- )
+ return _create_error_response(e)


  @app.api_route("/open_session", methods=["GET", "POST"])
@@ -264,9 +259,7 @@ async def open_session(obj: OpenSessionReqInput, request: Request):
  session_id = await tokenizer_manager.open_session(obj, request)
  return session_id
  except Exception as e:
- return ORJSONResponse(
- {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
- )
+ return _create_error_response(e)


  @app.api_route("/close_session", methods=["GET", "POST"])
@@ -276,9 +269,7 @@ async def close_session(obj: CloseSessionReqInput, request: Request):
  await tokenizer_manager.close_session(obj, request)
  return Response(status_code=200)
  except Exception as e:
- return ORJSONResponse(
- {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
- )
+ return _create_error_response(e)


  # fastapi implicitly converts json in the request to obj (dataclass)
@@ -312,9 +303,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
  return ret
  except ValueError as e:
  logger.error(f"Error: {e}")
- return ORJSONResponse(
- {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
- )
+ return _create_error_response(e)


  @app.api_route("/encode", methods=["POST", "PUT"])
@@ -325,9 +314,7 @@ async def encode_request(obj: EmbeddingReqInput, request: Request):
  ret = await tokenizer_manager.generate_request(obj, request).__anext__()
  return ret
  except ValueError as e:
- return ORJSONResponse(
- {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
- )
+ return _create_error_response(e)


  @app.api_route("/classify", methods=["POST", "PUT"])
@@ -338,9 +325,7 @@ async def classify_request(obj: EmbeddingReqInput, request: Request):
  ret = await tokenizer_manager.generate_request(obj, request).__anext__()
  return ret
  except ValueError as e:
- return ORJSONResponse(
- {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
- )
+ return _create_error_response(e)


  ##### OpenAI-compatible API endpoints #####
@@ -416,6 +401,12 @@ async def retrieve_file_content(file_id: str):
  return await v1_retrieve_file_content(file_id)


+ def _create_error_response(e):
+ return ORJSONResponse(
+ {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+ )
+
+
  def launch_engine(
  server_args: ServerArgs,
  ):
@@ -849,12 +840,10 @@ class Engine:
  group_name=group_name,
  backend=backend,
  )
-
- async def _init_group():
- return await tokenizer_manager.init_weights_update_group(obj, None)
-
  loop = asyncio.get_event_loop()
- return loop.run_until_complete(_init_group())
+ return loop.run_until_complete(
+ tokenizer_manager.init_weights_update_group(obj, None)
+ )

  def update_weights_from_distributed(self, name, dtype, shape):
  """Update weights from distributed source."""
@@ -863,22 +852,16 @@ class Engine:
  dtype=dtype,
  shape=shape,
  )
-
- async def _update_weights():
- return await tokenizer_manager.update_weights_from_distributed(obj, None)
-
  loop = asyncio.get_event_loop()
- return loop.run_until_complete(_update_weights())
+ return loop.run_until_complete(
+ tokenizer_manager.update_weights_from_distributed(obj, None)
+ )

  def get_weights_by_name(self, name, truncate_size=100):
  """Get weights by parameter name."""
  obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
-
- async def _get_weights():
- return await tokenizer_manager.get_weights_by_name(obj, None)
-
  loop = asyncio.get_event_loop()
- return loop.run_until_complete(_get_weights())
+ return loop.run_until_complete(tokenizer_manager.get_weights_by_name(obj, None))


  class Runtime:
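The Engine wrappers above now hand coroutines straight to loop.run_until_complete instead of defining single-use inner functions. Hypothetical offline usage of one of these synchronous wrappers (model path and parameter name are placeholders, assuming the sglang.Engine offline entry point):

    import sglang as sgl

    engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")
    # Blocks until the tokenizer manager's async call completes.
    weight = engine.get_weights_by_name("model.embed_tokens.weight", truncate_size=4)
    print(weight)
    engine.shutdown()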
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.4.1"
+ __version__ = "0.4.1.post1"
sglang-0.4.1.dist-info/METADATA → sglang-0.4.1.post1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.4.1
+ Version: 0.4.1.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -243,7 +243,7 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
  Requires-Dist: flashinfer==0.1.6; extra == "srt"
- Requires-Dist: sgl-kernel>=0.0.2.post8; extra == "srt"
+ Requires-Dist: sgl-kernel>=0.0.2.post10; extra == "srt"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
@@ -358,8 +358,8 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

  ## Adoption and Sponsorship
- The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI and DataCrunch.
+ The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI.

  ## Acknowledgment and Citation
  We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
- Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+ Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
sglang-0.4.1.dist-info/RECORD → sglang-0.4.1.post1.dist-info/RECORD CHANGED
@@ -4,14 +4,14 @@ sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
  sglang/bench_offline_throughput.py,sha256=iQiJCK3KQDCdwU1NVbIwbtthssWzBXiIsKUDA7Z_hO0,12510
  sglang/bench_one_batch.py,sha256=jkyMhK0lqn5dRCYgAh30qZrNHP4gAbXODymBMNXK86I,15859
  sglang/bench_one_batch_server.py,sha256=-fV9FTLNNcSIy0pgYeggXedPVK0fVsXZqVQswT8OMOY,5945
- sglang/bench_serving.py,sha256=3VQatM51v9f55aUQQ5crYMxxKHr1AbThicsWfBy_tjU,53190
+ sglang/bench_serving.py,sha256=YQiCZreejCPBTqMmZsCB99RMi1N-Jx-dZtaafcQ8-14,53377
  sglang/check_env.py,sha256=4OqpZaEJOfBM6-vtPILto5kqDmgiZM1Koc7lK78A7CI,8427
  sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
  sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
  sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
  sglang/llama3_eval.py,sha256=gWSboDchIGybIce88bJlrCG0yiLZ513mw4gcutJlzGM,10017
  sglang/utils.py,sha256=23jf4Mz8E5p5a6JOkjnfYZixdjZUk88F_mZ8rZcby5Q,11597
- sglang/version.py,sha256=pMtTmSUht-XtbR_7Doz6bsQqopJJd8rZ8I8zy2HwwoA,22
+ sglang/version.py,sha256=ARioq8ApVNckeQorLPVfHZeN9mlHMLbaNgLGNbGq-ys,28
  sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sglang/lang/chat_template.py,sha256=cnfjjxIIcYRGRxXlJlOGnpFxFuhMHut7DS52LsOMKcA,15826
  sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -23,7 +23,7 @@ sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
  sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
  sglang/lang/backend/base_backend.py,sha256=tdoh9YF3CyekY1BKiX9n7-aA4srDWIuA4RDJLM7q8qg,1985
  sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
- sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
+ sglang/lang/backend/openai.py,sha256=ha9a2P6T80TmSgYlyIwB1qYawWkjcOgiOptkktkqa1U,15436
  sglang/lang/backend/runtime_endpoint.py,sha256=dfs-yZ1ekKmnbpZLluQHWPmMeZJKbaaZRRGYRa9eBE8,10541
  sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
  sglang/srt/_custom_ops.py,sha256=Y4gyTDGhWz-W2Igq25Ojm8XFiyvkawW9I-79iwYvxJ0,3574
@@ -32,7 +32,7 @@ sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21
  sglang/srt/hf_transformers_utils.py,sha256=38Ms0H2-VMerOS6jnczcFtZMS6lhw9B5rSWKAfxVUfQ,7945
  sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
  sglang/srt/model_parallel.py,sha256=eLXZhvJ4wG6dh0FontNCIdVZvHYdWgaeY-5cu7TD9tE,6078
- sglang/srt/server.py,sha256=E9YKKXpXv3vPvRy0-cgcy0-5UA-OZz42-32EZWKTicA,34661
+ sglang/srt/server.py,sha256=vDucJl6qtEK2swzPJ_wYitaJvsI4MigMagGlBlH5V54,34033
  sglang/srt/server_args.py,sha256=LgnQ-kBJZ3E7hMMZj9bSK0mn7Bhjk1nJHxLcxl-lGTM,34572
  sglang/srt/utils.py,sha256=J8kFl6kDBwFZCM6AKaVTiqdhJKRg0JOH0pNrD1ZeWmM,41726
  sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
@@ -45,7 +45,7 @@ sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO
  sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
  sglang/srt/constrained/outlines_backend.py,sha256=CipNHNNXs8xtnJNVNe6FCwZUlSbIXbGmWVlZz3hUpFQ,6820
  sglang/srt/constrained/outlines_jump_forward.py,sha256=iZWXeR3gNYoMubLGyFmLPO4V2YsN5DiGjD71Xk9iFaE,6418
- sglang/srt/constrained/xgrammar_backend.py,sha256=4It9_GqU4UZFhxIw_7hkzpXaMPUtksk6Xfe0Agsfw7A,4620
+ sglang/srt/constrained/xgrammar_backend.py,sha256=76oUFXeB29bfnEVWa1-rIrwQm5jhuMlzAX10HtAq1fQ,4887
  sglang/srt/distributed/__init__.py,sha256=__tl9Frrf3PFrSyNYcn5i-y2rL-J4-Qn6RJwrsZ4xgc,83
  sglang/srt/distributed/communication_op.py,sha256=ZoIhboZyefiAwr-1K-wF3rAFSQ4Wt-RxXpsX443Gbt4,1157
  sglang/srt/distributed/parallel_state.py,sha256=HplRH5S0AWdwSdhoHYX9_UWQZlFjh2Z1LHaz68EXlpE,47555
@@ -77,20 +77,20 @@ sglang/srt/layers/attention/torch_native_backend.py,sha256=nQdeqWEMMH_wrod5wssDC
  sglang/srt/layers/attention/triton_backend.py,sha256=-TobyZHwlbJ5HhbFg-jgCqVOw4Y-opgEuFo-EusASQc,6264
  sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=oJ_UK1t229zF3hbTDiQe7t-X-IbM2dOxx4U2ch-vmjA,17847
  sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
- sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=tZJhzqcf1KKMT8z7_32eVk_D1NHP71c-S3UNxemfAHM,11542
+ sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=DWOZXSTVN5ZbcFjDjcqs-nPdUkxSwum0SVXhVKqwh2g,11688
  sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=lojFXRZMLWkzS2Y8uxaolnQhXaWKG19mCAWaF5KQeiI,6087
  sglang/srt/layers/moe/fused_moe_native.py,sha256=8q-LFZMSCGLc2_Gltp2lH0gSb4A1WOuKQW3wo3rpj5g,1601
- sglang/srt/layers/moe/topk.py,sha256=YjIiFqMERvkChkwZUqTrL_xaQyzsYsZzVUe4PzAhRZI,6299
+ sglang/srt/layers/moe/topk.py,sha256=JpeIl_-CNk0yyG3k5fmmNbbmR2_9bkKC23UoLOlMkjw,6954
  sglang/srt/layers/moe/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sglang/srt/layers/moe/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
  sglang/srt/layers/moe/ep_moe/layer.py,sha256=6iQU5ZjQ8IXGoQ8ZlBuJqyQxYTEem9vXI6rbVIWKlZw,22303
  sglang/srt/layers/moe/fused_moe_triton/__init__.py,sha256=h9yMFAL_bagUf-qBED8gSWdCOb7d8IdA-pE-L_nIg8E,842
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=GVugCKapd3CvgkvPQ_FmQplC12-grv3n1FRkLJc6WhY,30790
+ sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=zXwWUtthLa9E35EvlQ9A_mnIsQyA0_NYKsUBdJqONHo,31163
  sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=BclDj5JyCy-8Bfue4broL1-IG6a4dUyggE9WQLa06sg,20575
  sglang/srt/layers/quantization/__init__.py,sha256=VPYXShHvbvkOgVBlkIqic4RhdJ1y6EZ3r34T-nZMT1k,4606
  sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
  sglang/srt/layers/quantization/fp8.py,sha256=wNnpXLroIl7D98mlfCiXZPE9hrP5ricHrXY1WZBzEEo,30810
- sglang/srt/layers/quantization/fp8_kernel.py,sha256=v4-7hCQFyuUSZmeJS_5VDCu6a1-EGWXQ088FdPTjO_0,8137
+ sglang/srt/layers/quantization/fp8_kernel.py,sha256=eoO1enzD9jPC80id2oC3i8bt-LN6-4Ey223yOQ9yIPE,8792
  sglang/srt/layers/quantization/fp8_utils.py,sha256=HBJBaNcln1NrLxzw0ppUjMd6w-ryuGDDHCYJq7mRQac,4035
  sglang/srt/lora/lora.py,sha256=-o2mBmUvoVpdkgdAkWTARN4kfyep3UNEJLcg6moh0SU,15056
  sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
@@ -100,10 +100,10 @@ sglang/srt/managers/detokenizer_manager.py,sha256=nZkbwt4yty_oy8rvg4T7PbgyVLoBLo
  sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
  sglang/srt/managers/io_struct.py,sha256=_LWWqT3LNwZGaWhg2d3kTg1V2MTHKzRasCvxF9Nfpi4,15429
  sglang/srt/managers/schedule_batch.py,sha256=qryPWCdOTFzxomDa80U-5guShOb1K4kBUWcPCCchYB8,45762
- sglang/srt/managers/schedule_policy.py,sha256=cLNi__smbg02keWgUMfB_nEM3vllocPB0XyG1P5qO7I,15469
- sglang/srt/managers/scheduler.py,sha256=3Olw4Yf4Qtn1i4PqK3PT9hkXYGE8nemL2_Xjn8JLxAQ,61819
+ sglang/srt/managers/schedule_policy.py,sha256=QxjQ8-le062AMHHxool6CxkhvB4FIwhOQPzTX_JwL6U,15447
+ sglang/srt/managers/scheduler.py,sha256=Yh15uQFhJlku8a20-lhtIsiEHAcUmpL3BzL42kLVwiI,61637
  sglang/srt/managers/session_controller.py,sha256=Yp-IV3rXczACZxZXmF-QxW9CWICGy8KHQ9ttBGJ8WXA,2800
- sglang/srt/managers/tokenizer_manager.py,sha256=Vta7Lysvh4rPWqEB00shqAzpGUfv7GdPETDqFCU8RxA,31556
+ sglang/srt/managers/tokenizer_manager.py,sha256=uKiTt__lCFXG60zQhmM_K7dU7IuedVSIQHVw3x3y5-E,31758
  sglang/srt/managers/tp_worker.py,sha256=X1EwFX3FSsmXx7jeeX2tjZRocaujabQYWm-M-0CFEBE,7363
  sglang/srt/managers/tp_worker_overlap_thread.py,sha256=-QNBJRKxraa9Xt2WI1AFzZYdneIJ1eXv0GjFzDqXoE0,8926
  sglang/srt/mem_cache/base_prefix_cache.py,sha256=QC8HS8RC5DXu14kyXsxAgEUsn0f932p2DjqzbKjc6Bs,962
@@ -115,9 +115,9 @@ sglang/srt/metrics/collector.py,sha256=ZWoFx_FKN0sNMSZ8RJWUVQ0RFEYhIHxdw0d4TZTlu
  sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
  sglang/srt/model_executor/cuda_graph_runner.py,sha256=1n5WxoE9-0B3unwkkcR355K_D290h2LGt_7EvH02DQM,16246
  sglang/srt/model_executor/forward_batch_info.py,sha256=L5mVoW5SaO6To-7nGk0TZM-FFB5_78cARpJ-aC2rwD0,12883
- sglang/srt/model_executor/model_runner.py,sha256=Bm3NWTS3xmOGXEJnucnJZQldpVOzu-DCEUfaJy_PTU0,30104
+ sglang/srt/model_executor/model_runner.py,sha256=MLYBcYIQihu2I3PBTUghiU2mSWsDMzlKzcnX7yHa9JU,29837
  sglang/srt/model_loader/__init__.py,sha256=zGZkOBz1zx-pkaIy47BasL3fjDlAcxAXUTjInOhXHAE,919
- sglang/srt/model_loader/loader.py,sha256=VBrY4W9CiVvS_D8yXhdkW9jReV9rSMSkJplabz0Fxgk,43528
+ sglang/srt/model_loader/loader.py,sha256=7OG_8-66vFDFZ9kVKGNK1BFBjZ6ql449dlyvdCbMqvE,43876
  sglang/srt/model_loader/utils.py,sha256=0NaMR67fESFopaklmsleiL27XH1QUrjZW246MUu1EJ0,1369
  sglang/srt/model_loader/weight_utils.py,sha256=kQo9KPThjH3HAOCfC_tdwdrshdWuWJOVpPR0skSyaRY,24193
  sglang/srt/models/baichuan.py,sha256=PzBOFcEAixakPEkQSaJwC0Xc1fu-yCsN9T0I67r8QmY,14919
@@ -128,7 +128,7 @@ sglang/srt/models/deepseek.py,sha256=_cVOvR6eSEgRf6TUBpTD5uMdijDWFw4sSt4lGzl8tbg
  sglang/srt/models/deepseek_v2.py,sha256=-v_OJr2c3gJ0NMxQjvT3Jknz1XPGkzKx0TVR3NIiC6A,37284
  sglang/srt/models/exaone.py,sha256=dkERTZVxrRroqu5AGLP7D4N6n8HvDqlNaDQUIe15mZY,13038
  sglang/srt/models/gemma.py,sha256=ydRqsG-7004r1fAiz01LHUmcj_6XN0Tn4xO1keJnMQk,12126
- sglang/srt/models/gemma2.py,sha256=41PlW8pMb4rMETdAni_JWDhZeIn_QsTQireAyUjsURA,15848
+ sglang/srt/models/gemma2.py,sha256=-bFN-Te3YWAunLCrF-XFk_6fJS7gHM4Ca6h6aesXUTM,16362
  sglang/srt/models/gemma2_reward.py,sha256=nJ01KfqLSJtqMLm3sG8p2mGZFK1xhhjh7I7Ccb-_Hq8,2494
  sglang/srt/models/gpt2.py,sha256=2je1kE09sGcaORWnJuGYAkcwwOrT9EK-KhQaoCKjCSA,9517
  sglang/srt/models/gpt_bigcode.py,sha256=tovyOdJu2x3LkzmkdFXX_iJdkxuyChIDxwgvPBy6UPo,9528
@@ -136,7 +136,7 @@ sglang/srt/models/granite.py,sha256=AeQY9Dxd1ZnwgCYBK0vSXXiMGM-yt9iaOVf_ruOUHXw,
  sglang/srt/models/grok.py,sha256=J9lgNbFebvXgF19nfZyHwlGPlGWY_m0LgP506YvOYrU,15668
  sglang/srt/models/internlm2.py,sha256=_xcKtd6YtEFUTozaN-yUb0xbSYckRpomfPSKcAk4j-Y,12127
  sglang/srt/models/internlm2_reward.py,sha256=8K26A9oIFFGx_9U2mF87j7FX8K87HGKMnVL3ht1Uc7I,2398
- sglang/srt/models/llama.py,sha256=S7nS05hhFGghXu0v-w9RZyBTY6OCEVF5Aaw4GX_E_9g,19929
+ sglang/srt/models/llama.py,sha256=o3FYyOhkZJirzugyYz1kxs6RpY84O_uKowWWmt3jv24,19929
  sglang/srt/models/llama_classification.py,sha256=DwboM1xHXdf3Fddf7xGnrfdOLJwXdiJs994cIpAPa2g,2984
  sglang/srt/models/llama_embedding.py,sha256=rh-AiczPY_pTpzcACHvSMVjh1hsV_MZBBwP0LQxPsGM,3130
  sglang/srt/models/llama_reward.py,sha256=oPxh5E2UkxLULNdR68dFvt2I7j33CJFN6nyA-8L2_cg,4516
@@ -162,10 +162,10 @@ sglang/srt/models/torch_native_llama.py,sha256=YeXHorFm6QfnczLXwPb5TG9a-He0uiA9R
  sglang/srt/models/xverse.py,sha256=Oq--KqvbYu2H4TMVGEHpSnJLEwXBpxlncR9ilsQeckc,13579
  sglang/srt/models/xverse_moe.py,sha256=7E60YIST4ELYwLRgjtHiLRI5Uyc7XqQTM7jQXiWaQs4,15541
  sglang/srt/models/yivl.py,sha256=88OubtuZ38Dxb2LzfV_MTPBI4wKhh4NJqFu--efbhFM,4809
- sglang/srt/openai_api/adapter.py,sha256=DbLA4-v-QrKJHYDH4fpDSXqmyz_vpcFE-1tnhh60m6o,54057
- sglang/srt/openai_api/protocol.py,sha256=ecRNNqkhwwKZaIoJlPhtp2VTcHxBJDbNN8lrKS7uBx8,10406
+ sglang/srt/openai_api/adapter.py,sha256=X0HLuNhg-chDQjcdsQIRpZijlImEwZLHum3G0JgU4Go,54834
+ sglang/srt/openai_api/protocol.py,sha256=RMzeDfh2tZITjhNwB2nX68wZwQe40N6HBuVebCzEWiU,10468
  sglang/srt/sampling/sampling_batch_info.py,sha256=s--zNjk-LErZ5lMqnZ7KiuJltaziKRbQAU5qYpKIxAc,8564
- sglang/srt/sampling/sampling_params.py,sha256=n7RbBg_bS5fYhsiWa8uJYnfoXy_i5DvtTBOkuFnHDNU,5286
+ sglang/srt/sampling/sampling_params.py,sha256=BkgCJAOSmQXwJrNXg26zSjKfMy0d5mMN6oHRk_ZuESI,5499
  sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
  sglang/srt/sampling/penaltylib/orchestrator.py,sha256=J-DEemZcKm1--o37kf3qDOE8SZ_6H3d5oex49Mgq2ZU,10762
  sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=1Zp2aL6dD60mwD1tCcSG0x5IYo0v4z9ce-q_YwbJ9f8,2490
@@ -188,8 +188,8 @@ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c
  sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
  sglang/test/test_utils.py,sha256=HJG7kUQOk6n9FBbH89PDtQ41C3kt1cfJODhAEcFT0AQ,23823
  sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
- sglang-0.4.1.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
- sglang-0.4.1.dist-info/METADATA,sha256=RlVEQtwr_CCGTs83vNPwWXQukutbFfBz9xBPlXSl6qc,22523
- sglang-0.4.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- sglang-0.4.1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
- sglang-0.4.1.dist-info/RECORD,,
+ sglang-0.4.1.post1.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
+ sglang-0.4.1.post1.dist-info/METADATA,sha256=R2YDOrUU_49x5TEbNUODNlXvkSIzFqT7-hvInlSCs5k,22527
+ sglang-0.4.1.post1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ sglang-0.4.1.post1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.4.1.post1.dist-info/RECORD,,