sglang 0.3.4__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the supported public registries. It is provided for informational purposes only.
- sglang/bench_latency.py +2 -1
- sglang/lang/chat_template.py +17 -0
- sglang/launch_server_llavavid.py +1 -1
- sglang/srt/configs/__init__.py +3 -0
- sglang/srt/configs/model_config.py +27 -2
- sglang/srt/configs/qwen2vl.py +133 -0
- sglang/srt/constrained/fsm_cache.py +10 -3
- sglang/srt/conversation.py +27 -0
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/__init__.py +16 -5
- sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
- sglang/srt/layers/attention/flashinfer_backend.py +174 -54
- sglang/srt/layers/attention/triton_backend.py +22 -6
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
- sglang/srt/layers/linear.py +89 -63
- sglang/srt/layers/logits_processor.py +5 -5
- sglang/srt/layers/rotary_embedding.py +112 -0
- sglang/srt/layers/sampler.py +51 -39
- sglang/srt/lora/lora.py +3 -1
- sglang/srt/managers/data_parallel_controller.py +1 -1
- sglang/srt/managers/detokenizer_manager.py +4 -0
- sglang/srt/managers/image_processor.py +186 -13
- sglang/srt/managers/io_struct.py +10 -0
- sglang/srt/managers/schedule_batch.py +238 -68
- sglang/srt/managers/scheduler.py +69 -50
- sglang/srt/managers/tokenizer_manager.py +24 -4
- sglang/srt/managers/tp_worker.py +26 -111
- sglang/srt/managers/tp_worker_overlap_thread.py +209 -0
- sglang/srt/mem_cache/memory_pool.py +56 -10
- sglang/srt/mem_cache/radix_cache.py +4 -3
- sglang/srt/model_executor/cuda_graph_runner.py +87 -28
- sglang/srt/model_executor/forward_batch_info.py +83 -3
- sglang/srt/model_executor/model_runner.py +32 -11
- sglang/srt/models/chatglm.py +3 -3
- sglang/srt/models/deepseek_v2.py +2 -2
- sglang/srt/models/mllama.py +1004 -0
- sglang/srt/models/qwen2_vl.py +724 -0
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
- sglang/srt/sampling/sampling_batch_info.py +13 -3
- sglang/srt/sampling/sampling_params.py +5 -7
- sglang/srt/server.py +12 -0
- sglang/srt/server_args.py +10 -0
- sglang/srt/utils.py +22 -0
- sglang/test/run_eval.py +2 -0
- sglang/test/runners.py +20 -1
- sglang/test/srt/sampling/penaltylib/utils.py +1 -0
- sglang/test/test_utils.py +100 -3
- sglang/version.py +1 -1
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +17 -18
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +53 -48
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/linear.py
CHANGED
@@ -20,8 +20,10 @@ from vllm.distributed import (
 from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.parameter import (
     BasevLLMParameter,
+    PackedColumnParameter,
     PackedvLLMParameter,
     PerTensorScaleParameter,
+    RowvLLMParameter,
 )
 
 from sglang.srt.layers.quantization.base_config import (
@@ -39,6 +41,7 @@ WEIGHT_LOADER_V2_SUPPORTED = [
     "GPTQMarlinLinearMethod",
     "Fp8LinearMethod",
     "MarlinLinearMethod",
+    "GPTQLinearMethod",
 ]
 
 
@@ -50,7 +53,7 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
     return shard_size * marlin_tile_size, shard_offset * marlin_tile_size
 
 
-def
+def adjust_bitsandbytes_4bit_shard(
     param: Parameter, qkv_offsets: Dict[str, Tuple[int, int]], loaded_shard_id: str
 ) -> Tuple[int, int]:
     """Adjust the quantization offsets and sizes for BitsAndBytes sharding."""
@@ -207,7 +210,6 @@ class ReplicatedLinear(LinearBase):
             self.output_size,
             self.params_dtype,
             weight_loader=self.weight_loader,
-            prefix=prefix,
         )
 
         if bias:
@@ -315,7 +317,6 @@ class ColumnParallelLinear(LinearBase):
                 if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED
                 else self.weight_loader
             ),
-            prefix=prefix,
         )
         if bias:
             self.bias = Parameter(
@@ -345,8 +346,12 @@ class ColumnParallelLinear(LinearBase):
         if is_gguf_weight and isinstance(param, UninitializedParameter):
             param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)
 
+        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+
         param_data = param.data
-
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow here
+        if output_dim is not None and not use_bitsandbytes_4bit:
             shard_size = param_data.shape[output_dim]
             start_idx = tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
@@ -454,17 +459,22 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
             return
 
-        if is_gguf_weight
-
+        if is_gguf_weight:
+            tp_size = get_tensor_model_parallel_world_size()
+            tp_rank = get_tensor_model_parallel_rank()
+
+            output_dim = getattr(param, "output_dim", None)
+            shard_size = loaded_weight.size(output_dim) // tp_size
+            start_idx = tp_rank * shard_size
 
-
-
-
-
-
-
-
-
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+            param.shard_id.append(loaded_shard_id)
+            param.shard_id_map[loaded_shard_id] = len(param.data_container)
+            param.data_container.append(loaded_weight)
+            if len(param.data_container) == 2:
+                self.qweight = param.materialize_nested()
+            return
 
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
@@ -526,26 +536,17 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                     param, shard_size, shard_offset
                 )
 
-
-            if
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+            if use_bitsandbytes_4bit:
                 shard_size = loaded_weight.shape[output_dim]
                 shard_offset = loaded_weight.shape[output_dim] * loaded_shard_id
 
-            if is_gguf_weight:
-                tp_size = get_tensor_model_parallel_world_size()
-                output_dim = getattr(param, "output_dim", None)
-                shard_shape = list(loaded_weight.shape)
-                shard_shape[output_dim] = shard_shape[output_dim] // tp_size
-                param.shard_id.append(loaded_shard_id)
-                param.shard_size[loaded_shard_id] = shard_shape
-
-                input_dim = getattr(param, "input_dim", None)
-                input_size = loaded_weight.shape[input_dim]
-                param_data = param_data.narrow(input_dim, 0, input_size)
-
             param_data = param_data.narrow(output_dim, shard_offset, shard_size)
             start_idx = tp_rank * shard_size
-
+            # bitsandbytes loads the weights of the specific portion
+            # no need to narrow here
+            if not use_bitsandbytes_4bit:
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
             # Special case for AQLM codebooks.
         elif is_metadata:
             # metadata indicates fixed size concatenated along dim 0
@@ -595,7 +596,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             # If quantized, we need to adjust the offset and size to account
             # for the packing.
             if (
-                isinstance(param, PackedvLLMParameter)
+                isinstance(param, (PackedColumnParameter, PackedvLLMParameter))
                 and param.packed_dim == param.output_dim
             ):
                 shard_size, shard_offset = param.adjust_shard_indexes_for_packing(
@@ -617,7 +618,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             if isinstance(param, PerTensorScaleParameter):
                 param.load_merged_column_weight(loaded_weight=loaded_weight, shard_id=0)
                 return
-            elif type(param)
+            elif type(param) in (RowvLLMParameter, BasevLLMParameter):
                 param.load_merged_column_weight(loaded_weight=loaded_weight)
                 return
             self._load_fused_module_from_checkpoint(param, loaded_weight)
@@ -760,7 +761,7 @@ class QKVParallelLinear(ColumnParallelLinear):
             # If quantized, we need to adjust the offset and size to account
             # for the packing.
             if (
-                isinstance(param, PackedvLLMParameter)
+                isinstance(param, (PackedColumnParameter, PackedvLLMParameter))
                 and param.packed_dim == param.output_dim
             ):
                 shard_size, shard_offset = param.adjust_shard_indexes_for_packing(
@@ -780,10 +781,10 @@ class QKVParallelLinear(ColumnParallelLinear):
     ):
         if loaded_shard_id is None:  # special case for certain models
            if isinstance(param, PerTensorScaleParameter):
-                param.
+                param.load_qkv_weight(loaded_weight=loaded_weight, shard_id=0)
                 return
-            elif type(param)
-                param.
+            elif type(param) in (RowvLLMParameter, BasevLLMParameter):
+                param.load_qkv_weight(loaded_weight=loaded_weight)
                 return
             self._load_fused_module_from_checkpoint(param, loaded_weight)
             return
@@ -818,17 +819,22 @@ class QKVParallelLinear(ColumnParallelLinear):
             param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
             return
 
-        if is_gguf_weight
-
+        if is_gguf_weight:
+            tp_size = get_tensor_model_parallel_world_size()
+            tp_rank = get_tensor_model_parallel_rank()
+
+            output_dim = getattr(param, "output_dim", None)
+            shard_size = loaded_weight.size(output_dim) // tp_size
+            start_idx = tp_rank * shard_size
 
-
-
-
-
-
-
-
-
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+            param.shard_id.append(loaded_shard_id)
+            param.shard_id_map[loaded_shard_id] = len(param.data_container)
+            param.data_container.append(loaded_weight)
+            if len(param.data_container) == 3:
+                self.qweight = param.materialize_nested()
+            return
 
         param_data = param.data
         output_dim = getattr(param, "output_dim", None)
@@ -863,6 +869,8 @@ class QKVParallelLinear(ColumnParallelLinear):
                     self.total_num_kv_heads * self.head_size,
                 ),
             ]
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+
             packed_dim = getattr(param, "packed_dim", None)
             for shard_id, shard_offset, shard_size in shard_offsets:
                 # Special case for Quantized Weights.
@@ -877,6 +885,29 @@ class QKVParallelLinear(ColumnParallelLinear):
                        param, shard_size, shard_offset
                    )
 
+                if use_bitsandbytes_4bit:
+                    orig_qkv_offsets = {
+                        "q": (0, self.total_num_heads * self.head_size),
+                        "k": (
+                            self.total_num_heads * self.head_size,
+                            self.total_num_kv_heads * self.head_size,
+                        ),
+                        "v": (
+                            (self.total_num_heads + self.total_num_kv_heads)
+                            * self.head_size,
+                            self.total_num_kv_heads * self.head_size,
+                        ),
+                        "total": (
+                            (self.total_num_heads + 2 * self.total_num_kv_heads)
+                            * self.head_size,
+                            0,
+                        ),
+                    }
+
+                    shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
+                        param, orig_qkv_offsets, shard_id
+                    )
+
                 loaded_weight_shard = loaded_weight.narrow(
                     output_dim, shard_offset, shard_size
                 )
@@ -910,8 +941,8 @@ class QKVParallelLinear(ColumnParallelLinear):
                     param, shard_size, shard_offset
                 )
 
-
-            if
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+            if use_bitsandbytes_4bit:
                 orig_qkv_offsets = {
                     "q": (0, self.num_heads * self.head_size),
                     "k": (
@@ -927,29 +958,22 @@ class QKVParallelLinear(ColumnParallelLinear):
                         0,
                     ),
                 }
-                shard_size, shard_offset =
+                shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
                    param, orig_qkv_offsets, loaded_shard_id
                )
 
-            if is_gguf_weight:
-                tp_size = get_tensor_model_parallel_world_size()
-                output_dim = getattr(param, "output_dim", None)
-                shard_shape = list(loaded_weight.shape)
-                shard_shape[output_dim] = shard_shape[output_dim] // tp_size
-                param.shard_id.append(loaded_shard_id)
-                param.shard_size[loaded_shard_id] = shard_shape
-
-                input_dim = getattr(param, "input_dim", None)
-                input_size = loaded_weight.shape[input_dim]
-                param_data = param_data.narrow(input_dim, 0, input_size)
-
             param_data = param_data.narrow(output_dim, shard_offset, shard_size)
             if loaded_shard_id == "q":
                 shard_id = tp_rank
             else:
                 shard_id = tp_rank // self.num_kv_head_replicas
             start_idx = shard_id * shard_size
-
+
+            # bitsandbytes loads the weights of the specific portion
+            # no need to narrow here
+            if not use_bitsandbytes_4bit:
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
             # Special case for for AQLM codebooks.
         elif is_metadata:
             # metadata indicates fixed size concatenated along dim 0
@@ -1037,7 +1061,6 @@ class RowParallelLinear(LinearBase):
                 if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED
                 else self.weight_loader
             ),
-            prefix=prefix,
         )
         if not reduce_results and (bias and not skip_bias_add):
             raise ValueError(
@@ -1061,6 +1084,7 @@ class RowParallelLinear(LinearBase):
         tp_rank = get_tensor_model_parallel_rank()
         tp_size = get_tensor_model_parallel_world_size()
        input_dim = getattr(param, "input_dim", None)
+        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
 
         # Special case for GGUF
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
@@ -1076,7 +1100,9 @@ class RowParallelLinear(LinearBase):
             param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype)
 
         param_data = param.data
-
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow here
+        if input_dim is not None and not use_bitsandbytes_4bit:
             shard_size = param_data.shape[input_dim]
             start_idx = tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size)
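The recurring change in these hunks is a `use_bitsandbytes_4bit` flag threaded through the weight loaders: when it is set, the loader keeps the checkpoint tensor as-is instead of calling `narrow()`, because a bitsandbytes 4-bit checkpoint already holds only the current tensor-parallel rank's shard. A minimal sketch of that branch, using an illustrative helper name that is not part of sglang:

```python
import torch


def load_rank_shard(
    param_data: torch.Tensor,
    loaded_weight: torch.Tensor,
    shard_dim: int,
    tp_rank: int,
    use_bitsandbytes_4bit: bool,
) -> torch.Tensor:
    """Illustrative only: mirrors the narrow-or-skip branch added above."""
    if use_bitsandbytes_4bit:
        # bitsandbytes 4-bit checkpoints already store only this rank's portion.
        return loaded_weight
    shard_size = param_data.shape[shard_dim]
    start_idx = tp_rank * shard_size
    return loaded_weight.narrow(shard_dim, start_idx, shard_size)


# With tp_size=2, rank 1 receives rows 4..7 of a full [8, 4] weight.
full = torch.randn(8, 4)
local = torch.empty(4, 4)
print(load_rank_shard(local, full, shard_dim=0, tp_rank=1,
                      use_bitsandbytes_4bit=False).shape)  # torch.Size([4, 4])
```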
sglang/srt/layers/logits_processor.py
CHANGED
@@ -33,17 +33,17 @@ class LogitsProcessorOutput:
     # The logits of the next tokens. shape: [#seq, vocab_size]
     next_token_logits: torch.Tensor
     # The logprobs of the next tokens. shape: [#seq, vocab_size]
-    next_token_logprobs: torch.Tensor
+    next_token_logprobs: torch.Tensor = None
 
     # The normlaized logprobs of prompts. shape: [#seq]
-    normalized_prompt_logprobs: torch.Tensor
+    normalized_prompt_logprobs: torch.Tensor = None
     # The logprobs of input tokens. shape: [#token, vocab_size]
-    input_token_logprobs: torch.Tensor
+    input_token_logprobs: torch.Tensor = None
 
     # The logprob and id of the top-k tokens in input positions. shape [#seq, #token, k] of Tuple(logprob, token_id)
-    input_top_logprobs: List
+    input_top_logprobs: List = None
     # The logprob and id of the top-k tokens in output positions. shape [#seq, #token, k] of Tuple(logprob, token_id)
-    output_top_logprobs: List
+    output_top_logprobs: List = None
 
 
 @dataclasses.dataclass
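Giving these fields `None` defaults means code paths that only need the next-token logits can construct the output without filling in every logprob field. A hedged sketch, assuming `next_token_logits` is the only required field of the dataclass (only the fields visible in this hunk are known here, and the tensor contents are made up):

```python
import torch

from sglang.srt.layers.logits_processor import LogitsProcessorOutput

out = LogitsProcessorOutput(next_token_logits=torch.randn(2, 32000))
assert out.next_token_logprobs is None and out.output_top_logprobs is None
```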
sglang/srt/layers/rotary_embedding.py
ADDED
@@ -0,0 +1,112 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""MRotaryEmbedding"""
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+
+
+class MRotaryEmbedding:
+    """Rotary Embedding with Multimodal Sections."""
+
+    @staticmethod
+    def get_input_positions(
+        input_tokens: torch.Tensor,
+        image_grid_thw: Union[List[List[int]], torch.Tensor],
+        vision_start_token_id: int,
+        spatial_merge_size: int,
+        context_len: int = 0,
+    ) -> Tuple[List[List[int]], int]:
+        """Get mrope input positions and delta value."""
+
+        if isinstance(image_grid_thw, torch.Tensor):
+            image_grid_thw = image_grid_thw.tolist()
+
+        vision_start_indices = torch.argwhere(
+            input_tokens == vision_start_token_id
+        ).squeeze(1)
+        image_indices = vision_start_indices + 1
+        image_nums = image_indices.shape[0]
+        llm_pos_ids_list: list = []
+
+        st = 0
+        input_tokens_len = input_tokens.shape[0]
+        for image_index in range(image_nums):
+            ed = image_indices[image_index].item()
+            t, h, w = (
+                image_grid_thw[image_index][0],
+                image_grid_thw[image_index][1],
+                image_grid_thw[image_index][2],
+            )
+            llm_grid_t, llm_grid_h, llm_grid_w = (
+                t,
+                h // spatial_merge_size,
+                w // spatial_merge_size,
+            )
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+            t_index = (
+                torch.arange(llm_grid_t)
+                .view(-1, 1)
+                .expand(-1, llm_grid_h * llm_grid_w)
+                .flatten()
+            )
+            h_index = (
+                torch.arange(llm_grid_h)
+                .view(1, -1, 1)
+                .expand(llm_grid_t, -1, llm_grid_w)
+                .flatten()
+            )
+            w_index = (
+                torch.arange(llm_grid_w)
+                .view(1, 1, -1)
+                .expand(llm_grid_t, llm_grid_h, -1)
+                .flatten()
+            )
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+            )
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < input_tokens_len:
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            text_len = input_tokens_len - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        llm_positions = llm_positions[:, context_len:]
+        mrope_position_delta = (llm_positions.max() + 1 - input_tokens_len).item()
+        return llm_positions.tolist(), mrope_position_delta
+
+    @staticmethod
+    def get_next_input_positions(
+        mrope_position_delta: int,
+        context_len: int,
+        seq_len: int,
+    ) -> List[List[int]]:
+        return [
+            list(
+                range(
+                    context_len + mrope_position_delta, seq_len + mrope_position_delta
+                )
+            )
+            for _ in range(3)
+        ]
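A small usage sketch of the new helper. The token ids, the vision-start id, and the image grid below are made up for illustration; only the class, method names, and argument meanings come from the file above.

```python
import torch

from sglang.srt.layers.rotary_embedding import MRotaryEmbedding

VISION_START = 151652  # hypothetical vision-start token id (model specific)
# 2 text tokens, the vision-start token, 4 image placeholders
# (grid t=1, h=4, w=4 with spatial_merge_size=2 -> 1*2*2 tokens), 2 text tokens.
input_tokens = torch.tensor([10, 11, VISION_START, 0, 0, 0, 0, 12, 13])

positions, delta = MRotaryEmbedding.get_input_positions(
    input_tokens,
    image_grid_thw=[[1, 4, 4]],
    vision_start_token_id=VISION_START,
    spatial_merge_size=2,
)
# positions is a 3 x len(input_tokens) list (temporal/height/width ids);
# delta (-2 here) shifts the position ids used for subsequent decode steps:
next_positions = MRotaryEmbedding.get_next_input_positions(
    delta, context_len=9, seq_len=10
)  # [[7], [7], [7]]
```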
sglang/srt/layers/sampler.py
CHANGED
@@ -1,4 +1,5 @@
 import logging
+import os
 from typing import Union
 
 import torch
@@ -17,6 +18,11 @@ if is_flashinfer_available():
         top_p_renorm_prob,
     )
 
+
+# Crash on warning if we are running CI tests
+crash_on_warning = os.getenv("SGLANG_IS_IN_CI", "false") == "true"
+
+
 logger = logging.getLogger(__name__)
 
 
@@ -33,56 +39,62 @@ class Sampler(nn.Module):
         if isinstance(logits, LogitsProcessorOutput):
             logits = logits.next_token_logits
 
-        # Post process logits
         logits = logits.contiguous()
-
-
-
-
-
-        if self.use_nan_detectioin and torch.any(torch.isnan(probs)):
-            logger.warning("Detected errors during sampling! NaN in the probability.")
-            probs = torch.where(
-                torch.isnan(probs), torch.full_like(probs, 1e-10), probs
+
+        if self.use_nan_detectioin and torch.any(torch.isnan(logits)):
+            logger.warning("Detected errors during sampling! NaN in the logits.")
+            logits = torch.where(
+                torch.isnan(logits), torch.full_like(logits, -1e5), logits
             )
+            exit(1) if crash_on_warning else None
 
         if sampling_info.is_all_greedy:
             # Use torch.argmax if all requests use greedy sampling
-            batch_next_token_ids = torch.argmax(
-
-
-
-
-
-
-
-
-
+            batch_next_token_ids = torch.argmax(logits, -1)
+        else:
+            # Post process logits
+            logits.div_(sampling_info.temperatures)
+            probs = torch.softmax(logits, dim=-1)
+            logits = None
+            del logits
+
+            if global_server_args_dict["sampling_backend"] == "flashinfer":
+                max_top_k_round, batch_size = 32, probs.shape[0]
+                uniform_samples = torch.rand(
+                    (max_top_k_round, batch_size), device=probs.device
                 )
-
-
+                if sampling_info.need_min_p_sampling:
+                    probs = top_k_renorm_prob(probs, sampling_info.top_ks)
+                    probs = top_p_renorm_prob(probs, sampling_info.top_ps)
+                    batch_next_token_ids, success = min_p_sampling_from_probs(
+                        probs, uniform_samples, sampling_info.min_ps
+                    )
+                else:
+                    batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
+                        probs,
+                        uniform_samples,
+                        sampling_info.top_ks,
+                        sampling_info.top_ps,
+                        filter_apply_order="joint",
+                    )
+
+                if not torch.all(success):
+                    logger.warning("Detected errors during sampling!")
+                    batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
+            elif global_server_args_dict["sampling_backend"] == "pytorch":
+                # A slower fallback implementation with torch native operations.
+                batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
                     probs,
-                    uniform_samples,
                     sampling_info.top_ks,
                     sampling_info.top_ps,
-
+                    sampling_info.min_ps,
+                )
+            else:
+                raise ValueError(
+                    f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
                 )
 
-
-                logger.warning("Detected errors during sampling!")
-                batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
-        elif global_server_args_dict["sampling_backend"] == "pytorch":
-            # Here we provide a slower fallback implementation.
-            batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
-                probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
-            )
-        else:
-            raise ValueError(
-                f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
-            )
-
-        return batch_next_token_ids
+        return batch_next_token_ids.to(torch.int32)
 
 
 def top_k_top_p_min_p_sampling_from_probs_torch(
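The torch-native fallback branch calls `top_k_top_p_min_p_sampling_from_probs_torch(probs, top_ks, top_ps, min_ps)` with per-request tensors; its body is not shown in this diff. A rough standalone sketch of what such a filter-and-sample step can look like (the function name and the details below are illustrative, not the actual sglang implementation):

```python
import torch


def sketch_top_k_top_p_min_p_sample(probs, top_ks, top_ps, min_ps):
    # probs: [batch, vocab] softmax output; top_ks/top_ps/min_ps: [batch] tensors.
    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    # top-p: zero tokens once the cumulative mass before them exceeds top_p
    probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
    # top-k: keep only the k highest-probability tokens per row
    ranks = torch.arange(probs.shape[-1], device=probs.device).expand_as(probs_sort)
    probs_sort[ranks >= top_ks.view(-1, 1)] = 0.0
    # min-p: drop tokens below min_p times the per-row maximum probability
    probs_sort[probs_sort < probs_sort[:, :1] * min_ps.view(-1, 1)] = 0.0
    # renormalize, sample, and map back to original vocabulary indices
    sampled = torch.multinomial(probs_sort / probs_sort.sum(-1, keepdim=True), 1)
    return probs_idx.gather(-1, sampled).view(-1)


probs = torch.softmax(torch.randn(2, 16), dim=-1)
print(sketch_top_k_top_p_min_p_sample(
    probs,
    top_ks=torch.tensor([5, 8]),
    top_ps=torch.tensor([0.9, 0.8]),
    min_ps=torch.tensor([0.0, 0.05]),
))
```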
sglang/srt/lora/lora.py
CHANGED
@@ -351,7 +351,9 @@ class LoRAAdapter(nn.Module):
         loader = DefaultModelLoader(self.load_config)
         revision = getattr(self.config.hf_config, "revision", None)
         for name, loaded_weight in loader._get_weights_iterator(
-
+            DefaultModelLoader.Source(
+                model_path, revision=revision, fall_back_to_pt=True
+            )
         ):
             match = re.search(r"layers\.(\d+)\.", name)
             if match is not None:
sglang/srt/managers/detokenizer_manager.py
CHANGED
@@ -27,6 +27,7 @@ from sglang.srt.managers.io_struct import (
     BatchEmbeddingOut,
     BatchStrOut,
     BatchTokenIDOut,
+    GetMemPoolSizeReqOutput,
     UpdateWeightReqOutput,
 )
 from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN
@@ -111,6 +112,9 @@ class DetokenizerManager:
                 # If it is a weight update request, no detokenization is needed.
                 self.send_to_tokenizer.send_pyobj(recv_obj)
                 continue
+            elif isinstance(recv_obj, GetMemPoolSizeReqOutput):
+                self.send_to_tokenizer.send_pyobj(recv_obj)
+                continue
            elif self.tokenizer is None:
                # If the tokenizer is skipped, no detokenization is needed
                self.send_to_tokenizer.send_pyobj(recv_obj)