sglang 0.4.1__tar.gz → 0.4.1.post1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.4.1 → sglang-0.4.1.post1}/PKG-INFO +4 -4
- {sglang-0.4.1 → sglang-0.4.1.post1}/README.md +2 -2
- {sglang-0.4.1 → sglang-0.4.1.post1}/pyproject.toml +5 -2
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_serving.py +11 -3
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/openai.py +10 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/xgrammar_backend.py +6 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -14
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +17 -4
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/topk.py +14 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/fp8_kernel.py +14 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/schedule_policy.py +1 -1
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/scheduler.py +11 -14
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/tokenizer_manager.py +54 -45
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_executor/model_runner.py +0 -6
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/loader.py +22 -11
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gemma2.py +19 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llama.py +2 -2
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/openai_api/adapter.py +19 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/openai_api/protocol.py +2 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/sampling_params.py +9 -2
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/server.py +20 -37
- sglang-0.4.1.post1/sglang/version.py +1 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/PKG-INFO +4 -4
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/requires.txt +1 -1
- sglang-0.4.1/sglang/version.py +0 -1
- {sglang-0.4.1 → sglang-0.4.1.post1}/LICENSE +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/setup.cfg +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/api.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_latency.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_offline_throughput.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_one_batch.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/check_env.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/global_config.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/chat_template.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/choices.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/compiler.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/ir.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/tracer.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/launch_server.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/llama3_eval.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/_custom_ops.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/aio_rwlock.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/model_config.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/configs/qwen2vl.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/outlines_backend.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/conversation.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/parallel_state.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/custom_op_util.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/ep_moe/layer.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_native.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_triton/layer.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/fp8.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/fp8_utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/rotary_embedding.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/image_processor.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/io_struct.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/schedule_batch.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_executor/forward_batch_info.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/weight_utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/deepseek_v2.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/granite.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/grok.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llava.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/olmo2.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/phi3_small.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/qwen2_vl.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/server_args.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/runners.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_block_fp8.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_programs.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/test/test_utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang/utils.py +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/SOURCES.txt +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.1 → sglang-0.4.1.post1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.4.1 → sglang-0.4.1.post1}/PKG-INFO
RENAMED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.4.1
+Version: 0.4.1.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                        Version 2.0, January 2004
@@ -243,7 +243,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.2.
+Requires-Dist: sgl-kernel>=0.0.2.post10; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -358,8 +358,8 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI
+The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
-Please cite
+Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
```
{sglang-0.4.1 → sglang-0.4.1.post1}/README.md
RENAMED
```diff
@@ -57,8 +57,8 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI
+The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
-Please cite
+Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
```
{sglang-0.4.1 → sglang-0.4.1.post1}/pyproject.toml
RENAMED
```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.1"
+version = "0.4.1.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -23,7 +23,7 @@ runtime_common = ["aiohttp", "decord", "fastapi",
     "psutil", "pydantic", "python-multipart",
     "pyzmq>=25.1.2", "torchao>=0.7.0", "uvicorn", "uvloop",
     "xgrammar>=0.1.6"]
-srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1,<=0.6.4.post1", "cuda-python", "flashinfer==0.1.6", "sgl-kernel>=0.0.2.post10"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
@@ -60,6 +60,9 @@ dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
 "Homepage" = "https://github.com/sgl-project/sglang"
 "Bug Tracker" = "https://github.com/sgl-project/sglang/issues"
 
+[tool.setuptools.package-data]
+"sglang" = ["srt/layers/fused_moe_triton/configs/*.json"]
+
 [tool.setuptools.packages.find]
 exclude = [
     "assets*",
```
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/bench_serving.py
RENAMED
```diff
@@ -897,6 +897,7 @@ async def benchmark(
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
+    # Limit concurrency
     # From https://github.com/vllm-project/vllm/pull/9390
     semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
 
@@ -906,6 +907,7 @@ async def benchmark(
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
 
+    # Warmup
    print("Starting initial single prompt test run...")
    test_prompt, test_prompt_len, test_output_len = input_requests[0]
    test_input = RequestFuncInput(
@@ -924,11 +926,15 @@ async def benchmark(
            f"are correctly specified. Error: {test_output.error}"
        )
    else:
-        requests.post(base_url + "/flush_cache")
        print("Initial test run completed. Starting main benchmark run...")
 
-
+    # Flush cache
+    if "sglang" in backend:
+        requests.post(base_url + "/flush_cache")
+
+    time.sleep(1.0)
 
+    # Start profiler
    if profile:
        print("Starting profiler...")
        profile_output = await async_request_profile(
@@ -939,6 +945,7 @@ async def benchmark(
 
    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
+    # Run all requests
    benchmark_start_time = time.perf_counter()
    tasks: List[asyncio.Task] = []
    async for request in get_request(input_requests, request_rate):
@@ -959,6 +966,7 @@ async def benchmark(
        )
    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
 
+    # Stop profiler
    if profile:
        print("Stopping profiler...")
        profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
@@ -968,8 +976,8 @@ async def benchmark(
    if pbar is not None:
        pbar.close()
 
+    # Compute metrics and print results
    benchmark_duration = time.perf_counter() - benchmark_start_time
-
    metrics, output_lens = calculate_metrics(
        input_requests=input_requests,
        outputs=outputs,
```
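The warmup request would otherwise leave its prompt in the server's prefix (radix) cache and flatter the measured run, so the benchmark now flushes the cache before starting, and only for SGLang backends since the endpoint is SGLang-specific. A minimal sketch of the same call, assuming a local server on the default port:

```python
import requests

base_url = "http://127.0.0.1:30000"  # assumed default address of launch_server
# /flush_cache is the endpoint bench_serving.py posts to above; it empties
# the radix cache so the measured run starts from a cold prefix cache.
requests.post(base_url + "/flush_cache")
```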
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/lang/backend/openai.py
RENAMED
```diff
@@ -366,6 +366,11 @@ class OpenAI(BaseBackend):
 def openai_completion(
     client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
 ):
+    # if "ebnf" is in kwargs, warn and remove
+    if "ebnf" in kwargs:
+        warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
+        del kwargs["ebnf"]
+
     for attempt in range(retries):
         try:
             if is_chat:
@@ -398,6 +403,11 @@ def openai_completion(
 def openai_completion_stream(
     client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
 ):
+    # if "ebnf" is in kwargs, warn and remove
+    if "ebnf" in kwargs:
+        warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
+        del kwargs["ebnf"]
+
     for attempt in range(retries):
         try:
             if is_chat:
```
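Both completion helpers now strip the `ebnf` kwarg instead of forwarding it, so a grammar-constrained program degrades to a warning on OpenAI endpoints rather than a request error. An illustrative caller, assuming `sgl.gen` forwards an `ebnf=` keyword the way the rest of this release suggests:

```python
import sglang as sgl

@sgl.function
def yes_no(s, question):
    s += question
    # On the SGLang runtime this would constrain decoding with the grammar;
    # on the OpenAI backend the kwarg is warned about and dropped above.
    s += sgl.gen("answer", max_tokens=4, ebnf='root ::= "yes" | "no"')
```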
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/constrained/xgrammar_backend.py
RENAMED
```diff
@@ -126,6 +126,12 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
                     f"Skip invalid json_schema: json_schema={key_string}, {e=}"
                 )
                 return None
+        elif key_type == "ebnf":
+            try:
+                ctx = self.grammar_compiler.compile_grammar(key_string)
+            except RuntimeError as e:
+                logging.warning(f"Skip invalid ebnf: ebnf={key_string}, {e=}")
+                return None
         elif key_type == "regex":
             logger.warning(
                 "regex hasn't been supported by xgrammar yet. This is skipped."
```
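The new branch reuses the compiler that already handles JSON schemas, keyed by an `("ebnf", grammar)` tuple. For illustration only, a grammar string of the kind such compilers accept, with a `root` rule restricting output to two literals:

```python
# Illustrative EBNF text; the exact dialect is whatever
# grammar_compiler.compile_grammar accepts.
ebnf = 'root ::= "yes" | "no"'
key = ("ebnf", ebnf)  # cache-key shape used by the grammar backend above
```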
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py
RENAMED
```diff
@@ -292,27 +292,33 @@ def extend_attention_fwd(
     BLOCK_DPE = 0
     BLOCK_DV = triton.next_power_of_2(Lv)
 
-    if is_cuda_available and CUDA_CAPABILITY[0] >= 9:
-        if Lq <= 256:
-            BLOCK_M, BLOCK_N = (128, 64)
-        else:
-            BLOCK_M, BLOCK_N = (32, 64)
-    elif is_cuda_available and CUDA_CAPABILITY[0] >= 8:
-        if Lq <= 128:
-            BLOCK_M, BLOCK_N = (128, 128)
-        elif Lq <= 256:
-            BLOCK_M, BLOCK_N = (64, 64)
-        else:
-            BLOCK_M, BLOCK_N = (32, 64)
+    if is_hip_:
+        BLOCK_M, BLOCK_N = (64, 64)
+        num_warps = 4
+
     else:
-        BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+        if is_cuda_available and CUDA_CAPABILITY[0] >= 9:
+            if Lq <= 256:
+                BLOCK_M, BLOCK_N = (128, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
+        elif is_cuda_available and CUDA_CAPABILITY[0] >= 8:
+            if Lq <= 128:
+                BLOCK_M, BLOCK_N = (128, 128)
+            elif Lq <= 256:
+                BLOCK_M, BLOCK_N = (64, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
+        else:
+            BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+
+        num_warps = 4 if Lk <= 64 else 8
 
     sm_scale = sm_scale or 1.0 / (Lq**0.5)
     batch_size, head_num = b_seq_len.shape[0], q_extend.shape[1]
     kv_group_num = q_extend.shape[1] // k_extend.shape[1]
 
     grid = (batch_size, head_num, triton.cdiv(max_len_extend, BLOCK_M))
-    num_warps = 4 if Lk <= 64 else 8
     num_stages = 1
 
     extra_kargs = {}
```
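Tile sizes are now chosen per GPU generation, with a fixed fallback on HIP. A sketch of how a compute-capability check like `CUDA_CAPABILITY` is typically obtained; the tile values are the ones from the diff:

```python
import torch

# (major, minor) compute capability: (9, 0) on H100, (8, 0) on A100, etc.
major, _minor = torch.cuda.get_device_capability()
if major >= 9:
    BLOCK_M, BLOCK_N = (128, 64)   # Hopper: larger query tiles for Lq <= 256
elif major >= 8:
    BLOCK_M, BLOCK_N = (128, 128)  # Ampere, for head dim Lq <= 128
else:
    BLOCK_M, BLOCK_N = (64, 64)
```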
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
RENAMED
```diff
@@ -11,12 +11,17 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 import torch
 import triton
 import triton.language as tl
-from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
 from vllm import _custom_ops as ops
 
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
-from sglang.srt.utils import direct_register_custom_op, get_device_name
+from sglang.srt.utils import direct_register_custom_op, get_device_name, is_hip
+
+not_hip = False
+if not is_hip():
+    from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+
+    not_hip = True
 
 logger = logging.getLogger(__name__)
 padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0
@@ -267,8 +272,14 @@ def moe_align_block_size(
         (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
     )
     num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
-
-    if num_experts >= 224:
+    if not_hip and num_experts >= 224:
+        token_cnts_buffer = torch.empty(
+            (num_experts + 1) * num_experts, dtype=torch.int32, device=topk_ids.device
+        )
+        cumsum_buffer = torch.empty(
+            num_experts + 1, dtype=torch.int32, device=topk_ids.device
+        )
+
         sgl_moe_align_block_size(
             topk_ids,
             num_experts,
@@ -276,6 +287,8 @@ def moe_align_block_size(
             sorted_ids,
             expert_ids,
             num_tokens_post_pad,
+            token_cnts_buffer,
+            cumsum_buffer,
         )
     else:
         ops.moe_align_block_size(
```
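The `sgl_moe_align_block_size` path now receives two preallocated scratch buffers: per-expert token counters and a prefix-sum buffer. A worked size check for the expert count this branch targets; the layout comment is an assumption:

```python
import torch

num_experts = 256  # example; the sgl-kernel branch is taken when >= 224
# (num_experts + 1) * num_experts int32 counters for token counting
token_cnts_buffer = torch.empty(
    (num_experts + 1) * num_experts, dtype=torch.int32, device="cuda"
)  # 257 * 256 = 65,792 entries
# one cumulative-sum slot per expert, plus one leading slot
cumsum_buffer = torch.empty(num_experts + 1, dtype=torch.int32, device="cuda")
```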
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/moe/topk.py
RENAMED
```diff
@@ -1,3 +1,17 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
 from typing import Callable, Optional
 
 import torch
```
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/layers/quantization/fp8_kernel.py
RENAMED
```diff
@@ -1,3 +1,17 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
 from typing import List, Tuple
 
 import torch
```
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/scheduler.py
RENAMED
```diff
@@ -468,9 +468,6 @@ class Scheduler:
             self.send_to_tokenizer.send_pyobj(
                 UpdateWeightFromDiskReqOutput(success, message)
             )
-        elif isinstance(recv_req, GetWeightsByNameReqInput):
-            parameter = self.get_weights_by_name(recv_req)
-            self.send_to_tokenizer.send_pyobj(GetWeightsByNameReqOutput(parameter))
         elif isinstance(recv_req, InitWeightsUpdateGroupReqInput):
             success, message = self.init_weights_update_group(recv_req)
             self.send_to_tokenizer.send_pyobj(
@@ -565,7 +562,7 @@ class Scheduler:
 
         if req.logprob_start_len == -1:
             # By default, only return the logprobs for output tokens
-            req.logprob_start_len = len(
+            req.logprob_start_len = len(req.origin_input_ids) - 1
 
         # Truncate prompts that are too long
         if len(req.origin_input_ids) > self.max_req_input_len:
@@ -589,12 +586,15 @@ class Scheduler:
         if (
             req.sampling_params.json_schema is not None
             or req.sampling_params.regex is not None
+            or req.sampling_params.ebnf is not None
         ):
             assert self.grammar_backend is not None
             if req.sampling_params.json_schema is not None:
                 key = ("json", req.sampling_params.json_schema)
             elif req.sampling_params.regex is not None:
                 key = ("regex", req.sampling_params.regex)
+            elif req.sampling_params.ebnf is not None:
+                key = ("ebnf", req.sampling_params.ebnf)
 
             req.grammar = self.grammar_backend.get_cached_value(key)
             if not req.grammar:
@@ -629,16 +629,13 @@ class Scheduler:
         self.waiting_queue.append(req)
 
     def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
-
-            self.tree_cache_metrics["total"] += (
-                adder.log_input_tokens + adder.log_hit_tokens
-            ) / 10**9
-            self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
-            tree_cache_hit_rate = (
-                self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
-            )
-        else:
-            tree_cache_hit_rate = 0.0
+        self.tree_cache_metrics["total"] += (
+            adder.log_input_tokens + adder.log_hit_tokens
+        ) / 10**9
+        self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
+        tree_cache_hit_rate = (
+            self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
+        )
 
         num_used = self.max_total_num_tokens - (
             self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
```
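With `ebnf` now accepted alongside `json_schema` and `regex`, a request can ship a grammar in its sampling parameters. A hypothetical call against the native /generate endpoint (host and port are assumptions):

```python
import requests

resp = requests.post(
    "http://127.0.0.1:30000/generate",
    json={
        "text": "Is the sky blue? Answer:",
        "sampling_params": {
            "max_new_tokens": 4,
            # routed through the ("ebnf", ...) grammar-cache key shown above
            "ebnf": 'root ::= "yes" | "no"',
        },
    },
)
print(resp.json())
```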
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/managers/tokenizer_manager.py
RENAMED
```diff
@@ -22,7 +22,7 @@ import signal
 import sys
 import time
 import uuid
-from typing import Any, Awaitable, Dict, List, Optional, Tuple, Union
+from typing import Any, Awaitable, Dict, Generic, List, Optional, Tuple, TypeVar, Union
 
 import fastapi
 import uvloop
@@ -173,6 +173,15 @@ class TokenizerManager:
 
         # Others
         self.gracefully_exit = False
+        self.init_weights_update_group_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_weights_from_distributed_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.get_weights_by_name_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
 
         # Metrics
         if self.enable_metrics:
@@ -190,8 +199,7 @@ class TokenizerManager:
     ):
         created_time = time.time()
 
-
-        self.create_handle_loop()
+        self.auto_create_handle_loop()
 
         if isinstance(obj, EmbeddingReqInput) and self.is_generation:
             raise ValueError(
@@ -440,8 +448,7 @@ class TokenizerManager:
         obj: UpdateWeightFromDiskReqInput,
         request: Optional[fastapi.Request] = None,
     ) -> Tuple[bool, str]:
-
-        self.create_handle_loop()
+        self.auto_create_handle_loop()
 
         # default the load format to the server_args
         if obj.load_format is None:
@@ -456,7 +463,7 @@ class TokenizerManager:
 
     async def _wait_for_model_update_from_disk(
         self, obj: UpdateWeightFromDiskReqInput
-    ) -> Tuple[bool, str
+    ) -> Tuple[bool, str]:
         self.send_to_scheduler.send_pyobj(obj)
         self.model_update_result = asyncio.Future()
         if self.server_args.dp_size == 1:
@@ -485,15 +492,11 @@ class TokenizerManager:
         obj: InitWeightsUpdateGroupReqInput,
         request: Optional[fastapi.Request] = None,
     ) -> Tuple[bool, str]:
-
-        self.create_handle_loop()
-        self.send_to_scheduler.send_pyobj(obj)
-
-        self.init_weights_update_group_result = asyncio.Future()
+        self.auto_create_handle_loop()
         assert (
             self.server_args.dp_size == 1
         ), "dp_size must be 1 for init parameter update group"
-        result = await self.init_weights_update_group_result
+        result = (await self.init_weights_update_group_communicator(obj))[0]
         return result.success, result.message
 
     async def update_weights_from_distributed(
@@ -501,44 +504,32 @@ class TokenizerManager:
         obj: UpdateWeightsFromDistributedReqInput,
         request: Optional[fastapi.Request] = None,
     ) -> Tuple[bool, str]:
-
-
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1
+        ), "dp_size must be for update weights from distributed"
 
         # This means that weight sync
         # cannot run while requests are in progress.
         async with self.model_update_lock.writer_lock:
-            self.send_to_scheduler.send_pyobj(obj)
-            self.parameter_update_result: Awaitable[
-                UpdateWeightsFromDistributedReqOutput
-            ] = asyncio.Future()
-            assert (
-                self.server_args.dp_size == 1
-            ), "dp_size must be for update weights from distributed"
-            result = await self.parameter_update_result
+            result = (await self.update_weights_from_distributed_communicator(obj))[0]
             return result.success, result.message
 
     async def get_weights_by_name(
         self, obj: GetWeightsByNameReqInput, request: Optional[fastapi.Request] = None
     ):
-
-
-
-        self.send_to_scheduler.send_pyobj(obj)
-        self.get_weights_by_name_result = asyncio.Future()
+        self.auto_create_handle_loop()
+        results = await self.get_weights_by_name_communicator(obj)
+        all_parameters = [r.parameter for r in results]
         if self.server_args.dp_size == 1:
-
-            return result.parameter
+            return all_parameters[0]
         else:
-            self.get_weights_by_name_tmp = []
-            result = await self.get_weights_by_name_result
-            all_parameters = [r.parameter for r in result]
             return all_parameters
 
     async def open_session(
         self, obj: OpenSessionReqInput, request: Optional[fastapi.Request] = None
     ):
-
-        self.create_handle_loop()
+        self.auto_create_handle_loop()
 
         session_id = uuid.uuid4().hex
         obj.session_id = session_id
@@ -568,7 +559,7 @@ class TokenizerManager:
         background_tasks.add_task(abort_request)
         return background_tasks
 
-    def create_handle_loop(self):
+    def auto_create_handle_loop(self):
         if not self.to_create_loop:
             return
 
@@ -711,21 +702,14 @@ class TokenizerManager:
             assert (
                 self.server_args.dp_size == 1
             ), "dp_size must be 1 for init parameter update group"
-            self.init_weights_update_group_result.set_result(recv_obj)
+            self.init_weights_update_group_communicator.handle_recv(recv_obj)
         elif isinstance(recv_obj, UpdateWeightsFromDistributedReqOutput):
             assert (
                 self.server_args.dp_size == 1
             ), "dp_size must be 1 for update weights from distributed"
-            self.parameter_update_result.set_result(recv_obj)
+            self.update_weights_from_distributed_communicator.handle_recv(recv_obj)
         elif isinstance(recv_obj, GetWeightsByNameReqOutput):
-            if self.server_args.dp_size == 1:
-                self.get_weights_by_name_result.set_result(recv_obj)
-            else:
-                self.get_weights_by_name_tmp.append(recv_obj)
-                if len(self.get_weights_by_name_tmp) == self.server_args.dp_size:
-                    self.get_weights_by_name_result.set_result(
-                        self.get_weights_by_name_tmp
-                    )
+            self.get_weights_by_name_communicator.handle_recv(recv_obj)
         else:
             raise ValueError(f"Invalid object: {recv_obj=}")
 
@@ -809,3 +793,28 @@ class SignalHandler:
             f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..."
         )
         self.tokenizer_manager.gracefully_exit = True
+
+
+T = TypeVar("T")
+
+
+class _Communicator(Generic[T]):
+    def __init__(self, sender, fan_out: int):
+        self._sender = sender
+        self._fan_out = fan_out
+        self._result_future: Optional[asyncio.Future] = None
+        self._result_values: Optional[List[T]] = None
+
+    async def __call__(self, obj):
+        self._sender.send_pyobj(obj)
+        self._result_future = asyncio.Future()
+        self._result_values = []
+        await self._result_future
+        result_values = self._result_values
+        self._result_future = self._result_values = None
+        return result_values
+
+    def handle_recv(self, recv_obj: T):
+        self._result_values.append(recv_obj)
+        if len(self._result_values) == self._fan_out:
+            self._result_future.set_result(None)
```
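`_Communicator` replaces the scattered ad-hoc futures with one reusable fan-in primitive: send a request once, then await until `fan_out` replies (one per data-parallel rank) have been collected. A usage sketch assuming the class above is in scope; `FakeSender` is hypothetical and only mimics the ZMQ socket's `send_pyobj`:

```python
import asyncio

class FakeSender:
    def send_pyobj(self, obj):  # stand-in for the ZMQ socket, illustration only
        print(f"sent {obj!r}")

async def demo():
    comm = _Communicator(FakeSender(), fan_out=2)  # e.g. dp_size == 2
    task = asyncio.create_task(comm("get_weights"))
    await asyncio.sleep(0)               # let the send happen and the await park
    comm.handle_recv("reply-from-dp0")   # first rank answers
    comm.handle_recv("reply-from-dp1")   # fan_out reached -> future resolves
    print(await task)                    # ['reply-from-dp0', 'reply-from-dp1']

asyncio.run(demo())
```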
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_executor/model_runner.py
RENAMED
```diff
@@ -95,12 +95,6 @@ class ModelRunner:
         ):
             logger.info("MLA optimization is turned on. Use triton backend.")
             self.server_args.attention_backend = "triton"
-            # FIXME(HandH1998)
-            if (
-                "DeepseekV3ForCausalLM" in self.model_config.hf_config.architectures
-                and not self.server_args.disable_cuda_graph
-            ):
-                self.server_args.disable_cuda_graph = True
 
         if self.server_args.enable_double_sparsity:
             logger.info(
```
{sglang-0.4.1 → sglang-0.4.1.post1}/sglang/srt/model_loader/loader.py
RENAMED
```diff
@@ -770,6 +770,21 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             quant_state_dict,
         )
 
+    def _is_8bit_weight_name(self, weight_name: str):
+        quantized_suffix = {".scb", ".weight_format"}
+        return any(weight_name.lower().endswith(suffix) for suffix in quantized_suffix)
+
+    def _is_4bit_weight_name(self, weight_name: str):
+        quantized_suffix = {
+            "absmax",
+            "quant_map",
+            "nested_absmax",
+            "nested_quant_map",
+            "bitsandbytes",
+        }
+        suffix = weight_name.split(".")[-1]
+        return any(q_suffix in suffix for q_suffix in quantized_suffix)
+
     def _quantized_8bit_generator(
         self, hf_weights_files, use_safetensors, quant_state_dict
     ) -> Generator:
@@ -779,21 +794,18 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             if not weight_name.lower().endswith(".scb"):
                 continue
 
-            weight_key = weight_name.lower().replace(".scb", ".qweight")
+            weight_key = weight_name.lower().replace(".scb", ".weight")
             quant_state_dict[weight_key] = weight_tensor
 
         for weight_name, weight_tensor in self._hf_weight_iter(
             hf_weights_files, use_safetensors
         ):
-
-            if not weight_name.endswith((".weight", ".bias")):
+            if self._is_8bit_weight_name(weight_name):
                 continue
 
-            qweight_name = weight_name.replace(".weight", ".qweight")
-
-            if qweight_name in quant_state_dict:
+            if weight_name in quant_state_dict:
                 set_weight_attrs(weight_tensor, {"load_in_8bit": True})
-                yield qweight_name, weight_tensor
+                yield weight_name, weight_tensor
             else:
                 yield weight_name, weight_tensor
@@ -806,7 +818,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         weight_iterator = self._hf_weight_iter(hf_weights_files, use_safetensors)
         temp_state_dict = {}
         for weight_name, weight_tensor in weight_iterator:
-            if
+            if not self._is_4bit_weight_name(weight_name):
                 continue
             # bitsandbytes library requires
             # weight.quant_state.bitsandbytes__* in CPU
@@ -830,16 +842,15 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             hf_weights_files, use_safetensors
         ):
 
-            if
+            if self._is_4bit_weight_name(weight_name):
                 continue
 
             if (f"{weight_name}.quant_state.bitsandbytes__nf4" in temp_state_dict) or (
                 f"{weight_name}.quant_state.bitsandbytes__fp4" in temp_state_dict
             ):
                 quant_state = _parse_quant_state(weight_name, temp_state_dict)
-                weight_name = weight_name.replace(".weight", ".qweight")
                 quant_state_dict[weight_name] = quant_state
-                yield weight_name
+                yield weight_name, weight_tensor
             else:
                 yield weight_name, weight_tensor
 
```
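The two new predicates centralize bitsandbytes' checkpoint naming instead of ad-hoc `endswith` checks inside each generator. A quick illustration with made-up tensor names that follow those conventions:

```python
# Hypothetical names; suffixes follow bitsandbytes checkpoint conventions.
names = [
    "model.layers.0.mlp.down_proj.weight",         # plain weight -> neither
    "model.layers.0.mlp.down_proj.SCB",            # 8-bit scale  -> _is_8bit_weight_name
    "model.layers.0.mlp.down_proj.weight.absmax",  # 4-bit metadata -> _is_4bit_weight_name
    "model.layers.0.mlp.down_proj.weight.quant_state.bitsandbytes__nf4",  # 4-bit
]
```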