sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +23 -3
- sglang/srt/configs/deepseekvl2.py +10 -1
- sglang/srt/configs/model_config.py +5 -16
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/parallel_state.py +32 -5
- sglang/srt/entrypoints/http_server.py +7 -1
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/attention/flashattention_backend.py +218 -79
- sglang/srt/layers/dp_attention.py +12 -1
- sglang/srt/layers/moe/topk.py +30 -3
- sglang/srt/layers/quantization/__init__.py +134 -165
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/fp8_kernel.py +2 -1
- sglang/srt/layers/quantization/gptq.py +30 -40
- sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +12 -0
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +19 -33
- sglang/srt/lora/lora_manager.py +20 -7
- sglang/srt/lora/mem_pool.py +12 -6
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +6 -0
- sglang/srt/managers/io_struct.py +4 -2
- sglang/srt/managers/multimodal_processors/clip.py +63 -0
- sglang/srt/managers/schedule_batch.py +1 -0
- sglang/srt/managers/scheduler.py +25 -19
- sglang/srt/managers/tokenizer_manager.py +0 -1
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/model_executor/cuda_graph_runner.py +9 -8
- sglang/srt/model_executor/model_runner.py +9 -6
- sglang/srt/model_loader/loader.py +11 -1
- sglang/srt/model_loader/weight_utils.py +6 -3
- sglang/srt/models/clip.py +563 -0
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +151 -26
- sglang/srt/models/gemma3_causal.py +12 -2
- sglang/srt/models/gemma3_mm.py +6 -0
- sglang/srt/openai_api/adapter.py +88 -87
- sglang/srt/openai_api/protocol.py +10 -5
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/server_args.py +21 -11
- sglang/srt/speculative/eagle_worker.py +1 -1
- sglang/srt/utils.py +33 -0
- sglang/test/runners.py +27 -2
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +8 -4
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +57 -53
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
@@ -12,8 +12,9 @@ def _sgemm_lora_a_kernel(
     weights,
     output,
     # Matrix dimensions
-    N,  # r
+    N,  # stack_num * r
     K,  # input_dim
+    stack_num,
     # Strides
     x_stride_0,
     x_stride_1,
@@ -22,10 +23,11 @@ def _sgemm_lora_a_kernel(
     w_stride_2,
     output_stride_0,
     output_stride_1,
-    # Information on sequence lengths and weight id
+    # Information on sequence lengths,ranks and weight id
     seg_lens,
     seg_indptr,
     weight_indices,
+    lora_ranks,
     # Meta parameters
     BLOCK_S: tl.constexpr,
     BLOCK_N: tl.constexpr,
@@ -43,6 +45,9 @@ def _sgemm_lora_a_kernel(
     seg_len = tl.load(seg_lens + batch_id)
     w_index = tl.load(weight_indices + batch_id)
     seg_start = tl.load(seg_indptr + batch_id)
+    rank = tl.load(lora_ranks + w_index)
+    # Adjust N (stack_num * max_rank) according to the specific LoRA adapter
+    N = tl.minimum(N, rank * stack_num)
 
     # The tile in output matrix will have (pid_s, pid_n) as id
     num_pid_n = tl.cdiv(N, BLOCK_N)
@@ -91,11 +96,15 @@ def _sgemm_lora_a_kernel(
 
 
 def sgemm_lora_a_fwd(
-    x: torch.Tensor, weights: torch.Tensor, batch_info: LoRABatchInfo
+    x: torch.Tensor,
+    weights: torch.Tensor,
+    batch_info: LoRABatchInfo,
+    stack_num: int = 1,
 ) -> torch.Tensor:
     # x: (s, input_dim)
-    # weights: (num_lora, r, input_dim)
-    # output: (s, r)
+    # weights: (num_lora, stack_num * r, input_dim)
+    # output: (s, stack_num * r)
+    # stack_num: run_qkv_lora: 3, run_gate_up_lora: 2
     # when called by run_qkv_lora, the weights.shape[-2] will be 3 * r
     # input_dim is much larger than r
 
@@ -126,6 +135,7 @@ def sgemm_lora_a_fwd(
         output,
         R,
         K,
+        stack_num,
         x.stride(0),
         x.stride(1),
         weights.stride(0),
@@ -136,6 +146,7 @@ def sgemm_lora_a_fwd(
         batch_info.seg_lens,
         batch_info.seg_indptr,
         batch_info.weight_indices,
+        batch_info.lora_ranks,
         BLOCK_S,
         BLOCK_R,
         BLOCK_K,
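Taken together, the sgemm_lora_a changes let one batch mix adapters of different ranks: the kernel looks up each sequence's adapter rank in lora_ranks and clamps the output width to rank * stack_num, so columns beyond an adapter's true rank are never computed. A minimal PyTorch sketch of the resulting semantics (the helper and the Python loop are illustrative, not the Triton kernel):

import torch

def lora_a_reference(x, weights, seg_indptr, weight_indices, lora_ranks, stack_num=1):
    # x: (s, input_dim); weights: (num_lora, stack_num * max_r, input_dim)
    output = torch.zeros(x.shape[0], weights.shape[1], dtype=x.dtype)
    for i in range(len(weight_indices)):
        start, end = int(seg_indptr[i]), int(seg_indptr[i + 1])
        w = int(weight_indices[i])
        n = int(lora_ranks[w]) * stack_num  # the clamp the kernel applies with tl.minimum
        output[start:end, :n] = x[start:end] @ weights[w, :n, :].T
    return output

# Two sequences (lengths 4 and 2) served by adapters of rank 16 and 8:
x = torch.randn(6, 64)
weights = torch.randn(2, 16, 64)  # max_r = 16, stack_num = 1
out = lora_a_reference(
    x, weights,
    seg_indptr=torch.tensor([0, 4, 6]),
    weight_indices=torch.tensor([0, 1]),
    lora_ranks=torch.tensor([16, 8]),
)
print(out[4:, 8:].abs().max())  # tensor(0.): the rank-8 adapter's extra columns stay zero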
sglang/srt/lora/triton_ops/sgemm_lora_b.py
CHANGED
@@ -26,13 +26,14 @@ def _sgemm_lora_b_kernel(
     seg_lens,
     seg_indptr,
     weight_indices,
+    lora_ranks,
     # Meta parameters
     BLOCK_S: tl.constexpr,
     BLOCK_N: tl.constexpr,
     BLOCK_K: tl.constexpr,
     # For fused output scaling and adding
     fuse_scaling_add,
-    scaling,
+    scalings,
 ):
     # x: (s, K), s is the sum of sequence lengths
     # weights: (num_lora, N, K)
@@ -45,6 +46,10 @@ def _sgemm_lora_b_kernel(
     seg_len = tl.load(seg_lens + batch_id)
     w_index = tl.load(weight_indices + batch_id)
     seg_start = tl.load(seg_indptr + batch_id)
+    rank = tl.load(lora_ranks + w_index)
+    scaling = tl.load(scalings + w_index)
+    # Adjust K (rank) according to the specific LoRA adapter
+    K = tl.minimum(K, rank)
 
     # The tile in output matrix will have (pid_s, pid_n) as id
     num_pid_n = tl.cdiv(N, BLOCK_N)
@@ -100,12 +105,11 @@ def sgemm_lora_b_fwd(
     weights: torch.Tensor,
     batch_info: LoRABatchInfo,
     base_output: torch.Tensor = None,
-    scaling: float = 1.0,
 ) -> torch.Tensor:
-    # x: (s, r)
-    # weights: (num_lora, output_dim, r)
+    # x: (s, max_r)
+    # weights: (num_lora, output_dim, max_r)
     # output: (s, output_dim)
-    # output_dim is much larger than r
+    # output_dim is much larger than max_r
 
     assert x.is_contiguous()
     assert weights.is_contiguous()
@@ -150,10 +154,11 @@ def sgemm_lora_b_fwd(
         batch_info.seg_lens,
         batch_info.seg_indptr,
         batch_info.weight_indices,
+        batch_info.lora_ranks,
         BLOCK_S,
         BLOCK_N,
         BLOCK_R,
         fuse_scaling_add,
-        scaling,
+        batch_info.scalings,
     )
     return output
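The sgemm_lora_b side pairs with this: the single scaling: float = 1.0 argument is gone, and the kernel instead clamps the reduction dimension K to the adapter's rank and multiplies by that adapter's own factor from batch_info.scalings. A matching reference sketch (again illustrative, not the kernel):

import torch

def lora_b_reference(x, weights, seg_indptr, weight_indices, lora_ranks,
                     scalings, base_output=None):
    # x: (s, max_r); weights: (num_lora, output_dim, max_r)
    out = (torch.zeros(x.shape[0], weights.shape[1], dtype=x.dtype)
           if base_output is None else base_output)
    for i in range(len(weight_indices)):
        start, end = int(seg_indptr[i]), int(seg_indptr[i + 1])
        w = int(weight_indices[i])
        k = int(lora_ranks[w])  # clamp K to this adapter's rank
        out[start:end] += float(scalings[w]) * (x[start:end, :k] @ weights[w, :, :k].T)
    return out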
sglang/srt/lora/utils.py
CHANGED
@@ -25,6 +25,12 @@ class LoRABatchInfo:
     # The index of lora adapter used by each sequence, in shape (bs,)
     weight_indices: torch.Tensor
 
+    # ranks of each lora adapter, in shape (lora_num,)
+    lora_ranks: torch.Tensor
+
+    # scaling of each lora adapter, in shape (lora_num,)
+    scalings: torch.Tensor
+
 
 class LoRAType(Enum):
     LORA_A = 0
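Both new fields are indexed by adapter id rather than by sequence, which is what lets the kernels above fetch per-adapter metadata with a single tl.load(... + w_index). With hypothetical values for two loaded adapters:

import torch

lora_ranks = torch.tensor([16, 8])        # shape (lora_num,)
scalings = torch.tensor([2.0, 0.5])       # shape (lora_num,)
weight_indices = torch.tensor([0, 1, 1])  # shape (bs,): adapter used by each sequence

print(lora_ranks[weight_indices])  # tensor([16,  8,  8])
print(scalings[weight_indices])    # tensor([2.0000, 0.5000, 0.5000])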
sglang/srt/managers/io_struct.py
CHANGED
@@ -20,7 +20,7 @@ import copy
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union
 
 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -650,7 +650,7 @@ class ProfileReqInput:
     # If it is set, profiling is automatically stopped after this step, and
     # the caller doesn't need to run stop_profile.
     num_steps: Optional[int] = None
-    activities: Optional[List[str]] = None
+    activities: Optional[List[Literal["CPU", "GPU", "MEM", "CUDA_PROFILER"]]] = None
 
 
 class ProfileReqType(Enum):
@@ -675,6 +675,8 @@ class ProfileReq:
     output_dir: Optional[str] = None
     num_steps: Optional[int] = None
     activities: Optional[List[str]] = None
+    with_stack: Optional[bool] = None
+    record_shapes: Optional[bool] = None
 
 
 @dataclass
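ProfileReqInput is the request body for the profiling endpoint, so the Literal annotation now spells out the accepted activity names, including the new "CUDA_PROFILER". Assuming the /start_profile route from earlier releases, a client call could look like this sketch:

import requests

# Hypothetical local server; the JSON fields mirror ProfileReqInput.
resp = requests.post(
    "http://127.0.0.1:30000/start_profile",
    json={
        "output_dir": "/tmp/sglang_profile",
        "num_steps": 10,  # auto-stops after 10 steps; no /stop_profile needed
        "activities": ["CPU", "GPU", "MEM"],
    },
)
print(resp.status_code, resp.text)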
sglang/srt/managers/multimodal_processors/clip.py
ADDED
@@ -0,0 +1,63 @@
+import asyncio
+from typing import List, Union
+
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
+    get_global_processor,
+)
+from sglang.srt.models.clip import CLIPModel
+from sglang.srt.utils import load_image
+
+
+class ClipImageProcessor(BaseMultimodalProcessor):
+    models = [CLIPModel]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+
+    @staticmethod
+    def _process_single_image_task(images, input_text):
+        # input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask'
+        return get_global_processor()(
+            images=images, text=input_text, return_tensors="pt"
+        )
+
+    async def _process_single_image(self, images, input_text):
+        if self.executor is not None:
+            loop = asyncio.get_event_loop()
+            image_inputs = await loop.run_in_executor(
+                self.executor,
+                ClipImageProcessor._process_single_image_task,
+                images,
+                input_text,
+            )
+        else:
+            image_inputs = self._processor(
+                images=images, text=[input_text], return_tensors="pt"
+            )
+
+        return image_inputs
+
+    async def process_mm_data_async(
+        self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
+    ):
+        if not image_data:
+            return None
+
+        if isinstance(input_text, list):
+            assert len(input_text) and isinstance(input_text[0], int)
+            input_text = self._processor.tokenizer.decode(input_text)
+
+        if not isinstance(image_data, list):
+            image_data = [image_data]
+
+        if len(image_data) > 0:
+            images = [load_image(image)[0] for image in image_data]
+        else:
+            images = load_image(image_data[0])[0]
+
+        image_inputs = await self._process_single_image(images, input_text)
+        image_inputs["data_hashes"] = [hash(str(image_data))]
+        image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
+
+        return image_inputs
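The wrapped call is a plain Hugging Face processor invocation; outside sglang, the equivalent (using the standard transformers CLIP processor and a hypothetical local image file) is:

from PIL import Image
from transformers import CLIPProcessor

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
image = Image.open("example.jpg")  # hypothetical local file
inputs = processor(images=image, text="a photo of a cat", return_tensors="pt")
print(inputs.keys())  # input_ids, attention_mask, pixel_values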
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -1376,6 +1376,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         if (
             global_server_args_dict["enable_flashinfer_mla"]
             or global_server_args_dict["enable_flashmla"]
+            or global_server_args_dict["attention_backend"] == "fa3"
         ):
             decode_seq_lens = self.seq_lens.cpu()
         else:
sglang/srt/managers/scheduler.py
CHANGED
@@ -379,7 +379,7 @@ class Scheduler(
         # Init profiler
         self.torch_profiler = None
         self.torch_profiler_output_dir: Optional[str] = None
-        self.torch_profiler_activities: Optional[List[str]] = None
+        self.profiler_activities: Optional[List[str]] = None
         self.profiler_target_forward_ct: Optional[int] = None
 
         # Init metrics stats
@@ -1186,7 +1186,7 @@ class Scheduler(
         ret = None
 
         # Handle DP attention
-        if self.server_args.enable_dp_attention:
+        if self.server_args.enable_dp_attention or self.server_args.enable_sp_layernorm:
             ret, _ = self.prepare_dp_attn_batch(ret)
 
         return ret
@@ -1703,18 +1703,12 @@ class Scheduler(
     def save_remote_model(self, params):
         url = params["url"]
 
-        if isinstance(self.tp_worker, TpModelWorkerClient):
-            worker = self.tp_worker.worker
-        else:
-            worker = self.tp_worker
+        worker = self.tp_worker.worker
 
         worker.model_runner.save_remote_model(url)
 
     def save_sharded_model(self, params):
-        if isinstance(self.tp_worker, TpModelWorkerClient):
-            worker = self.tp_worker.worker
-        else:
-            worker = self.tp_worker
+        worker = self.tp_worker.worker
 
         worker.model_runner.save_sharded_model(
             path=params["path"],
@@ -1813,7 +1807,11 @@ class Scheduler(
     def profile(self, recv_req: ProfileReq):
         if recv_req.type == ProfileReqType.START_PROFILE:
             return self.start_profile(
-                recv_req.output_dir, recv_req.num_steps, recv_req.activities
+                recv_req.output_dir,
+                recv_req.num_steps,
+                recv_req.activities,
+                recv_req.with_stack,
+                recv_req.record_shapes,
             )
         else:
             return self.stop_profile()
@@ -1823,8 +1821,10 @@ class Scheduler(
         output_dir: Optional[str],
         num_steps: Optional[int],
         activities: Optional[List[str]],
+        with_stack: Optional[bool],
+        record_shapes: Optional[bool],
     ) -> None:
-        if self.torch_profiler_activities:
+        if self.profiler_activities:
             return ProfileReqOutput(
                 success=False,
                 message="Profiling is already in progress. Call /stop_profile first.",
@@ -1836,7 +1836,7 @@ class Scheduler(
             activities = ["CPU", "GPU"]
 
         self.torch_profiler_output_dir = output_dir
-        self.torch_profiler_activities = activities
+        self.profiler_activities = activities
         logger.info(
             "Profiling starts. Traces will be saved to: %s",
             self.torch_profiler_output_dir,
@@ -1853,13 +1853,17 @@ class Scheduler(
         if torchprof_activities:
             self.torch_profiler = torch.profiler.profile(
                 activities=torchprof_activities,
-                with_stack=True,
+                with_stack=with_stack if with_stack is not None else True,
+                record_shapes=record_shapes if record_shapes is not None else False,
             )
             self.torch_profiler.start()
 
         if "MEM" in activities:
             torch.cuda.memory._record_memory_history(max_entries=100000)
 
+        if "CUDA_PROFILER" in activities:
+            torch.cuda.cudart().cudaProfilerStart()
+
         if num_steps:
             self.profiler_target_forward_ct = self.forward_ct + num_steps
             # The caller will be notified when reaching profiler_target_forward_ct
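The new "CUDA_PROFILER" activity brackets the profiled steps with the CUDA profiler start/stop API, the same hook Nsight Systems uses for --capture-range=cudaProfilerApi. A standalone sketch of that bracket (the matmul is a stand-in workload):

import torch

# Run under: nsys profile --capture-range=cudaProfilerApi python script.py
a = torch.randn(1024, 1024, device="cuda")
torch.cuda.cudart().cudaProfilerStart()  # capture begins here
b = a @ a                                # region of interest
torch.cuda.synchronize()
torch.cuda.cudart().cudaProfilerStop()   # capture ends here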
@@ -1868,7 +1872,7 @@ class Scheduler(
         return ProfileReqOutput(success=True, message="Succeeded")
 
     def stop_profile(self) -> None:
-        if self.torch_profiler_activities is None:
+        if self.profiler_activities is None:
             return
 
         logger.info("Stop profiling...")
@@ -1881,21 +1885,24 @@ class Scheduler(
             )
         )
 
-        if "MEM" in self.torch_profiler_activities:
+        if "MEM" in self.profiler_activities:
             memory_profile_path = os.path.join(
-                self.torch_profiler_trace_dir,
+                self.torch_profiler_output_dir,
                 str(time.time()) + f"-TP-{self.tp_rank}-memory" + ".pickle",
             )
             torch.cuda.memory._dump_snapshot(memory_profile_path)
             torch.cuda.memory._record_memory_history(enabled=None)
 
+        if "CUDA_PROFILER" in self.profiler_activities:
+            torch.cuda.cudart().cudaProfilerStop()
+
         logger.info(
             "Profiling done. Traces are saved to: %s",
             self.torch_profiler_output_dir,
         )
         self.torch_profiler = None
         self.torch_profiler_output_dir = None
-        self.torch_profiler_activities = None
+        self.profiler_activities = None
 
         if self.profiler_target_forward_ct:
             self.send_to_tokenizer.send_pyobj(
@@ -1963,7 +1970,6 @@ def run_scheduler_process(
     dp_rank: Optional[int],
     pipe_writer,
 ):
-
     # Generate the prefix
     if dp_rank is None:
         prefix = f" TP{tp_rank}"
sglang/srt/managers/tokenizer_manager.py
CHANGED
@@ -261,7 +261,6 @@ class TokenizerManager:
         self.start_profile_communicator = _Communicator(
             self.send_to_scheduler, server_args.dp_size
         )
-        self.health_check_communitcator = _Communicator(self.send_to_scheduler, 1)
         self.get_internal_state_communicator = _Communicator(
             self.send_to_scheduler, server_args.dp_size
         )
sglang/srt/managers/tp_worker.py
CHANGED
@@ -132,6 +132,9 @@ class TpModelWorker:
         )[0]
         set_random_seed(self.random_seed)
 
+        # A reference make this class has the same member as TpModelWorkerClient
+        self.worker = self
+
     def get_worker_info(self):
         return (
             self.max_total_num_tokens,
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -124,8 +124,8 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
         # capture less.
         capture_bs = list(range(1, 9)) + list(range(9, 33, 2)) + [64, 96, 128, 160]
 
-    if is_hip_:
-        capture_bs += [i * 8 for i in range(21, 33)]
+    if _is_hip:
+        capture_bs += [i * 8 for i in range(21, 33)]
 
     if max(capture_bs) > model_runner.req_to_token_pool.size:
         # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
@@ -174,6 +174,7 @@ class CudaGraphRunner:
         self.disable_padding = model_runner.server_args.disable_cuda_graph_padding
         self.is_encoder_decoder = model_runner.model_config.is_encoder_decoder
         self.enable_dp_attention = model_runner.server_args.enable_dp_attention
+        self.enable_sp_layernorm = model_runner.server_args.enable_sp_layernorm
         self.speculative_algorithm = model_runner.server_args.speculative_algorithm
         self.tp_size = model_runner.server_args.tp_size
         self.dp_size = model_runner.server_args.dp_size
@@ -245,8 +246,8 @@ class CudaGraphRunner:
             )
         else:
             self.encoder_lens = None
-
-        if self.enable_dp_attention:
+        if self.enable_dp_attention or self.enable_sp_layernorm:
+            # TODO(ch-wan): SP layernorm should use a different logic to manage gathered_buffer
             self.gathered_buffer = torch.zeros(
                 (
                     self.max_bs * self.dp_size * self.num_tokens_per_bs,
@@ -288,7 +289,7 @@ class CudaGraphRunner:
         self.model_runner.token_to_kv_pool.capture_mode = False
 
     def can_run(self, forward_batch: ForwardBatch):
-        if self.enable_dp_attention:
+        if self.enable_dp_attention or self.enable_sp_layernorm:
             total_global_tokens = sum(forward_batch.global_num_tokens_cpu)
 
             is_bs_supported = forward_batch.can_run_dp_cuda_graph and (
@@ -369,7 +370,7 @@ class CudaGraphRunner:
             encoder_lens = None
         mrope_positions = self.mrope_positions[:, :bs]
 
-        if self.enable_dp_attention:
+        if self.enable_dp_attention or self.enable_sp_layernorm:
             self.global_num_tokens_gpu.copy_(
                 torch.tensor(
                     [
@@ -471,7 +472,7 @@ class CudaGraphRunner:
         raw_num_token = raw_bs * self.num_tokens_per_bs
 
         # Pad
-        if self.enable_dp_attention:
+        if self.enable_dp_attention or self.enable_sp_layernorm:
             index = bisect.bisect_left(
                 self.capture_bs, sum(forward_batch.global_num_tokens_cpu)
             )
@@ -497,7 +498,7 @@ class CudaGraphRunner:
             self.encoder_lens[:raw_bs].copy_(forward_batch.encoder_lens)
         if forward_batch.mrope_positions is not None:
             self.mrope_positions[:, :raw_bs].copy_(forward_batch.mrope_positions)
-        if self.enable_dp_attention:
+        if self.enable_dp_attention or self.enable_sp_layernorm:
             self.global_num_tokens_gpu.copy_(forward_batch.global_num_tokens_gpu)
 
         if hasattr(forward_batch.spec_info, "hidden_states"):
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -64,6 +64,7 @@ from sglang.srt.model_loader.loader import (
 )
 from sglang.srt.model_loader.utils import set_default_torch_dtype
 from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.patch_torch import monkey_patch_torch_reductions
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
@@ -229,6 +230,10 @@ class ModelRunner:
         elif server_args.enable_flashmla:
             logger.info("MLA optimization is turned on. Use flashmla decode.")
             server_args.attention_backend = "flashmla"
+        elif server_args.attention_backend == "fa3":
+            logger.info(
+                f"MLA optimization is turned on. Use flash attention 3 backend."
+            )
         else:
             logger.info("MLA optimization is turned on. Use triton backend.")
             server_args.attention_backend = "triton"
@@ -280,9 +285,6 @@ class ModelRunner:
 
         if server_args.enable_deepep_moe:
             logger.info("DeepEP is turned on.")
-            assert (
-                server_args.enable_dp_attention == True
-            ), "Currently DeepEP is bind to Attention DP. Set '--enable-dp-attention --enable-deepep-moe'"
 
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")
@@ -881,7 +883,7 @@ class ModelRunner:
                 "Please use `--attention-backend flashinfer`."
             )
             logger.warning(
-                "FlashAttention v3 Backend is in Beta. Multimodal,
+                "FlashAttention v3 Backend is in Beta. Multimodal, FP8, and Speculative Decoding are not supported."
             )
             from sglang.srt.layers.attention.flashattention_backend import (
                 FlashAttentionBackend,
@@ -1082,8 +1084,9 @@ def _model_load_weights_direct(model, named_tensors: List[Tuple[str, torch.Tensor]]):
 
 def _unwrap_tensor(tensor, tp_rank):
     if isinstance(tensor, LocalSerializedTensor):
-        return tensor.get(tp_rank)
-    return tensor
+        monkey_patch_torch_reductions()
+        tensor = tensor.get(tp_rank)
+    return tensor.to(torch.cuda.current_device())
 
 
 @dataclass
sglang/srt/model_loader/loader.py
CHANGED
@@ -14,7 +14,6 @@ from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, cast
 
-import gguf
 import huggingface_hub
 import numpy as np
 import torch
@@ -1155,6 +1154,17 @@ class GGUFModelLoader(BaseModelLoader):
         See "Standardized tensor names" in
         https://github.com/ggerganov/ggml/blob/master/docs/gguf.md for details.
         """
+
+        # only load the gguf module when needed
+        try:
+            import gguf
+
+            # FIXME: add version check for gguf
+        except ImportError as err:
+            raise ImportError(
+                "Please install gguf via `pip install gguf` to use gguf quantizer."
+            ) from err
+
         config = model_config.hf_config
         model_type = config.model_type
         # hack: ggufs have a different name than transformers
sglang/srt/model_loader/weight_utils.py
CHANGED
@@ -22,7 +22,6 @@ from typing import (
 )
 
 import filelock
-import gguf
 import huggingface_hub.constants
 import numpy as np
 import safetensors.torch
@@ -93,7 +92,7 @@ def convert_bin_to_safetensor_file(
     pt_filename: str,
     sf_filename: str,
 ) -> None:
-    loaded = torch.load(pt_filename, map_location="cpu")
+    loaded = torch.load(pt_filename, map_location="cpu", weights_only=True)
     if "state_dict" in loaded:
         loaded = loaded["state_dict"]
     shared = _shared_pointers(loaded)
@@ -381,7 +380,7 @@ def np_cache_weights_iterator(
         disable=not enable_tqdm,
         bar_format=_BAR_FORMAT,
     ):
-        state = torch.load(bin_file, map_location="cpu")
+        state = torch.load(bin_file, map_location="cpu", weights_only=True)
         for name, param in state.items():
             param_path = os.path.join(np_folder, name)
             with open(param_path, "wb") as f:
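Both torch.load call sites now pass weights_only=True, which restricts unpickling to tensors and primitive containers so that a crafted checkpoint cannot execute arbitrary code during load. The difference in isolation:

import torch

torch.save({"w": torch.zeros(2, 2)}, "/tmp/ckpt.bin")

# Safe path: only tensor/primitive types may be reconstructed.
state = torch.load("/tmp/ckpt.bin", map_location="cpu", weights_only=True)
print(state["w"].shape)  # torch.Size([2, 2])

# A checkpoint that pickles arbitrary objects would raise an
# UnpicklingError here instead of silently running their code.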
@@ -464,6 +463,8 @@ def pt_weights_iterator(
 def get_gguf_extra_tensor_names(
     gguf_file: str, gguf_to_hf_name_map: Dict[str, str]
 ) -> List[str]:
+    import gguf
+
     reader = gguf.GGUFReader(gguf_file)
     expected_gguf_keys = set(gguf_to_hf_name_map.keys())
     exact_gguf_keys = set([tensor.name for tensor in reader.tensors])
@@ -479,6 +480,8 @@ def gguf_quant_weights_iterator(
     them to torch tensors
     """
 
+    import gguf
+
     reader = gguf.GGUFReader(gguf_file)
 
     for tensor in reader.tensors:
|