sglang 0.3.1__py3-none-any.whl → 0.3.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +7 -2
- sglang/global_config.py +5 -13
- sglang/lang/interpreter.py +0 -3
- sglang/srt/constrained/fsm_cache.py +5 -1
- sglang/srt/layers/activation.py +12 -0
- sglang/srt/layers/attention_backend.py +12 -12
- sglang/srt/layers/fused_moe/layer.py +27 -7
- sglang/srt/layers/layernorm.py +12 -0
- sglang/srt/layers/sampler.py +32 -97
- sglang/srt/lora/lora_manager.py +11 -8
- sglang/srt/managers/schedule_batch.py +1 -0
- sglang/srt/managers/tp_worker.py +8 -7
- sglang/srt/model_executor/cuda_graph_runner.py +12 -1
- sglang/srt/model_executor/model_runner.py +24 -41
- sglang/srt/models/deepseek_v2.py +6 -1
- sglang/srt/models/minicpm3.py +5 -1
- sglang/srt/models/olmoe.py +415 -0
- sglang/srt/sampling/sampling_batch_info.py +3 -50
- sglang/srt/server.py +6 -1
- sglang/srt/server_args.py +34 -1
- sglang/srt/utils.py +7 -51
- sglang/test/test_utils.py +0 -1
- sglang/version.py +1 -1
- {sglang-0.3.1.dist-info → sglang-0.3.1.post1.dist-info}/METADATA +2 -2
- {sglang-0.3.1.dist-info → sglang-0.3.1.post1.dist-info}/RECORD +28 -27
- {sglang-0.3.1.dist-info → sglang-0.3.1.post1.dist-info}/WHEEL +1 -1
- {sglang-0.3.1.dist-info → sglang-0.3.1.post1.dist-info}/LICENSE +0 -0
- {sglang-0.3.1.dist-info → sglang-0.3.1.post1.dist-info}/top_level.txt +0 -0
sglang/bench_latency.py
CHANGED
```diff
@@ -63,7 +63,7 @@ from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import suppress_other_loggers
+from sglang.srt.utils import kill_child_process, suppress_other_loggers
 
 
 @dataclasses.dataclass
@@ -502,4 +502,9 @@ if __name__ == "__main__":
         format="%(message)s",
     )
 
-    main(server_args, bench_args)
+    try:
+        main(server_args, bench_args)
+    except Exception as e:
+        raise e
+    finally:
+        kill_child_process(os.getpid(), including_parent=False)
```
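The new `try`/`finally` wrapper guarantees that subprocesses spawned by the benchmark are reaped even if `main` raises. A minimal sketch of that cleanup pattern on POSIX, assuming `psutil` is available; sglang's real `kill_child_process` helper may be implemented differently:

```python
import multiprocessing as mp
import os
import signal
import time

import psutil  # assumption: psutil is installed


def kill_children(parent_pid: int) -> None:
    # Stand-in for kill_child_process(pid, including_parent=False):
    # terminate every child of `parent_pid`, but not the parent itself.
    for child in psutil.Process(parent_pid).children(recursive=True):
        child.send_signal(signal.SIGKILL)


def worker() -> None:
    time.sleep(60)  # simulates a long-lived model-server subprocess


if __name__ == "__main__":
    mp.Process(target=worker).start()
    try:
        print("benchmark body runs here")  # stand-in for main(server_args, bench_args)
    finally:
        kill_children(os.getpid())  # children are reaped even if the body raised
```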
sglang/global_config.py
CHANGED
```diff
@@ -1,5 +1,7 @@
 """Global configurations"""
 
+import os
+
 
 class GlobalConfig:
     def __init__(self):
@@ -16,30 +18,20 @@ class GlobalConfig:
         self.base_min_new_token_ratio = 0.1
         self.new_token_ratio_decay = 0.001
 
-        # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
-        # This can improve the speed for large batch sizes during prefill.
-        self.layer_sync_threshold = 8192
-
         # Runtime constants: others
         self.num_continue_decode_steps = 10
         self.retract_decode_steps = 20
-        self.flashinfer_workspace_size = 384 * 1024 * 1024
+        self.flashinfer_workspace_size = os.environ.get(
+            "FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024
+        )
 
         # Output tokenization configs
         self.skip_special_tokens_in_output = True
         self.spaces_between_special_tokens_in_out = True
 
         # Interpreter optimization configs
-        self.eager_fill_image = False
         self.enable_precache_with_tracing = True
         self.enable_parallel_encoding = True
-        self.enable_parallel_decoding = True
-
-        # Deprecated
-        # Choices: ["no_adjust", "adjust_cache"]
-        # no_adjust: Do not adjust the position embedding of KV cache.
-        # adjust_cache: Adjust the position embedding of KV cache.
-        self.concate_and_append_mode = "no_adjust"
 
 
 global_config = GlobalConfig()
```
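The workspace size is now overridable through an environment variable instead of being hard-coded. A minimal sketch of the override semantics; the `int()` cast is an assumption about how a consumer would normalize the value, since `os.environ.get` returns a string whenever the variable is actually set:

```python
import os

# Falls back to 384 MiB when FLASHINFER_WORKSPACE_SIZE is unset.
workspace_size = int(os.environ.get("FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024))
print(f"flashinfer workspace: {workspace_size // (1024 * 1024)} MiB")
```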
sglang/lang/interpreter.py
CHANGED
```diff
@@ -434,9 +434,6 @@ class StreamExecutor:
         self.cur_images.append((path, base64_data))
         self.text_ += self.chat_template.image_token
 
-        # if global_config.eager_fill_image:
-        #     self.backend.fill_image(self)
-
     def _spec_gen(self, sampling_params):
         stop = sampling_params.stop
         max_new_tokens = sampling_params.max_new_tokens
```
sglang/srt/constrained/fsm_cache.py
CHANGED
```diff
@@ -29,6 +29,7 @@ class FSMCache(BaseToolCache):
         tokenizer_args_dict,
         enable=True,
         skip_tokenizer_init=False,
+        constrained_json_whitespace_pattern=None,
     ):
         super().__init__(enable=enable)
 
@@ -63,11 +64,14 @@ class FSMCache(BaseToolCache):
             self.outlines_tokenizer.vocabulary = (
                 self.outlines_tokenizer.tokenizer.get_vocab()
            )
+        self.constrained_json_whitespace_pattern = constrained_json_whitespace_pattern
 
    def init_value(self, key):
        key_type, key_string = key
        if key_type == "json":
-            regex = build_regex_from_schema(key_string)
+            regex = build_regex_from_schema(
+                key_string, whitespace_pattern=self.constrained_json_whitespace_pattern
+            )
        elif key_type == "regex":
            regex = key_string
        else:
```
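The new `constrained_json_whitespace_pattern` knob is forwarded into outlines' schema-to-regex conversion. A hedged sketch of its effect; the import path below is what recent outlines releases expose and may differ across versions, and the empty-string pattern is only an illustrative choice:

```python
import json

from outlines.fsm.json_schema import build_regex_from_schema  # path varies by outlines version

schema = json.dumps({"type": "object", "properties": {"name": {"type": "string"}}})

# Default: outlines allows a permissive whitespace pattern between JSON tokens.
default_regex = build_regex_from_schema(schema)

# With the new pass-through, a pattern such as "" forbids whitespace entirely,
# constraining generation to compact JSON.
compact_regex = build_regex_from_schema(schema, whitespace_pattern=r"")

print(len(default_regex), len(compact_regex))
```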
sglang/srt/layers/activation.py
CHANGED
```diff
@@ -13,6 +13,7 @@ limitations under the License.
 
 """Fused operators for activation layers."""
 
+import logging
 from typing import Optional
 
 import torch
@@ -28,6 +29,10 @@ from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.utils import set_weight_attrs
 
+from sglang.srt.utils import is_hip
+
+logger = logging.getLogger(__name__)
+
 
 class SiluAndMul(CustomOp):
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
@@ -135,3 +140,10 @@ def get_act_fn(
             act_fn, intermediate_size, input_is_parallel, params_dtype
         )
     return act_fn
+
+
+if is_hip():
+    logger.info(
+        "FlashInfer is not available on AMD GPUs. Fallback to other kernel libraries."
+    )
+    from vllm.model_executor.layers.activation import GeluAndMul, SiluAndMul
```
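The same `is_hip()` guard recurs across several files in this release (activation, layernorm, attention backend, sampler, LoRA manager): FlashInfer is imported or re-exported only on non-ROCm builds, with the vLLM implementations used otherwise. A generic sketch of the pattern, with `is_hip` re-implemented here as a stand-in for `sglang.srt.utils.is_hip` (which may differ in detail):

```python
import logging

logger = logging.getLogger(__name__)


def is_hip() -> bool:
    # Stand-in: torch.version.hip is a string on ROCm builds and None otherwise.
    import torch

    return bool(getattr(torch.version, "hip", None))


if is_hip():
    logger.info("FlashInfer is not available on AMD GPUs. Falling back.")
    # e.g. re-export the vLLM kernels under the same names:
    # from vllm.model_executor.layers.activation import GeluAndMul, SiluAndMul
```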
sglang/srt/layers/attention_backend.py
CHANGED
```diff
@@ -12,22 +12,26 @@ from typing import TYPE_CHECKING
 
 import torch
 import torch.nn as nn
-from flashinfer import (
-    BatchDecodeWithPagedKVCacheWrapper,
-    BatchPrefillWithPagedKVCacheWrapper,
-    BatchPrefillWithRaggedKVCacheWrapper,
-)
-from flashinfer.cascade import merge_state
-from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
 
 from sglang.global_config import global_config
 from sglang.srt.layers.flashinfer_utils import update_flashinfer_indices
 from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
+from sglang.srt.utils import is_hip
 
 if TYPE_CHECKING:
     from sglang.srt.model_executor.model_runner import ModelRunner
 
+# ROCm: flashinfer available later
+if not is_hip():
+    from flashinfer import (
+        BatchDecodeWithPagedKVCacheWrapper,
+        BatchPrefillWithPagedKVCacheWrapper,
+        BatchPrefillWithRaggedKVCacheWrapper,
+    )
+    from flashinfer.cascade import merge_state
+    from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
+
 
 class AttentionBackend(ABC):
     """The base class of attention backends"""
@@ -150,7 +154,7 @@ class FlashInferAttnBackend(AttentionBackend):
         # Some heuristics to check whether to use ragged forward
         use_ragged = False
         if (
-
+            torch.sum(input_metadata.seq_lens).item() >= 4096
             and self.model_runner.sliding_window_size is None
         ):
             use_ragged = True
@@ -301,10 +305,6 @@ class FlashInferAttnBackend(AttentionBackend):
             layer.layer_id, input_metadata.out_cache_loc, k, v
         )
 
-        if total_num_tokens >= global_config.layer_sync_threshold:
-            # TODO: Revisit this. Why is this synchronize needed?
-            torch.cuda.synchronize()
-
         return o.view(-1, layer.tp_q_head_num * layer.head_dim)
 
     def forward_decode(self, q, k, v, layer: nn.Module, input_metadata: InputMetadata):
```
sglang/srt/layers/fused_moe/layer.py
CHANGED
```diff
@@ -18,6 +18,8 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.quantization.fp8 import Fp8Config
 from vllm.model_executor.utils import set_weight_attrs
 
+from sglang.srt.utils import is_hip
+
 logger = init_logger(__name__)
 
 
@@ -381,6 +383,7 @@ from torch.nn import Module
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     all_close_1d,
+    normalize_e4m3fn_to_e4m3fnuz,
     per_tensor_dequantize,
 )
 from vllm.utils import print_warning_once
@@ -479,14 +482,12 @@ class Fp8MoEMethod(FusedMoEMethodBase):
 
     def process_weights_after_loading(self, layer: Module) -> None:
 
-        # If checkpoint is fp16, quantize in place.
+        # If checkpoint is fp16 or bfloat16, quantize in place.
         if not self.quant_config.is_checkpoint_fp8_serialized:
-            w13_weight = torch.empty_like(
-                layer.w13_weight.data, dtype=torch.float8_e4m3fn
-            )
-            w2_weight = torch.empty_like(
-                layer.w2_weight.data, dtype=torch.float8_e4m3fn
-            )
+            # If ROCm, use float8_e4m3fnuz instead (MI300x HW)
+            fp8_dtype = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn
+            w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
+            w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)
 
             # Re-initialize w13_scale because we directly quantize
             # merged w13 weights and generate a single scaling factor.
@@ -534,6 +535,25 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 layer.a2_scale.max(), requires_grad=False
             )
 
+        # If ROCm, normalize the weights and scales to e4m3fnuz
+        if is_hip():
+            # Normalize the weights and scales
+            w13_weight, w13_scale, a13_scale = normalize_e4m3fn_to_e4m3fnuz(
+                layer.w13_weight, layer.w13_scale, layer.a13_scale
+            )
+            w2_weight, w2_scale, a2_scale = normalize_e4m3fn_to_e4m3fnuz(
+                layer.w2_weight, layer.w2_scale, layer.a2_scale
+            )
+            # Reset the parameters
+            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+            layer.w13_scale = torch.nn.Parameter(w13_scale, requires_grad=False)
+            if a13_scale is not None:
+                layer.a13_scale = torch.nn.Parameter(a13_scale, requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+            layer.w2_scale = torch.nn.Parameter(w2_scale, requires_grad=False)
+            if a2_scale is not None:
+                layer.a2_scale = torch.nn.Parameter(a2_scale, requires_grad=False)
+
         # Fp8 moe kernel needs single weight scale for w13 per expert.
         # We take the max then dequant and requant each expert.
         assert layer.w13_scale is not None
```
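MI300-class ROCm hardware uses `float8_e4m3fnuz`, whose exponent bias differs from `float8_e4m3fn`, so e4m3fn-quantized checkpoints need renormalizing. The sketch below illustrates the idea as I understand the two formats (same payload layout, bias shifted by one, `0x80` means NaN instead of -0.0); it is a hedged sketch, not vLLM's `normalize_e4m3fn_to_e4m3fnuz` itself:

```python
import torch


def to_e4m3fnuz(weight: torch.Tensor, scale: torch.Tensor):
    # Reinterpret the fp8 payload and double the scale so weight * scale is
    # numerically unchanged; remap the -0.0 bit pattern, which would be NaN
    # under e4m3fnuz, to +0.0.
    assert weight.dtype == torch.float8_e4m3fn
    bits = weight.view(torch.int8).clone()
    bits[bits == -128] = 0
    return bits.view(torch.float8_e4m3fnuz), scale * 2.0


w = torch.randn(4, 4).to(torch.float8_e4m3fn)
s = torch.tensor(0.05)
w_fnuz, s_fnuz = to_e4m3fnuz(w, s)
# Dequantized values should agree.
print(torch.allclose(w.float() * s, w_fnuz.float() * s_fnuz))
```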
sglang/srt/layers/layernorm.py
CHANGED
```diff
@@ -15,6 +15,7 @@ limitations under the License.
 
 """Fused operators for normalization layers."""
 
+import logging
 from typing import Optional, Tuple, Union
 
 import torch
@@ -27,6 +28,10 @@ from flashinfer.norm import (
 )
 from vllm.model_executor.custom_op import CustomOp
 
+from sglang.srt.utils import is_hip
+
+logger = logging.getLogger(__name__)
+
 
 class RMSNorm(CustomOp):
     def __init__(
@@ -109,3 +114,10 @@ class GemmaRMSNorm(CustomOp):
             return x, residual
         out = gemma_rmsnorm(x, self.weight.data, self.variance_epsilon)
         return out
+
+
+if is_hip():
+    logger.info(
+        "FlashInfer is not available on AMD GPUs. Fallback to other kernel libraries."
+    )
+    from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm
```
sglang/srt/layers/sampler.py
CHANGED
```diff
@@ -1,51 +1,28 @@
-import dataclasses
 import logging
-from typing import Tuple, Union
+from typing import Union
 
 import torch
-from flashinfer.sampling import (
-    min_p_sampling_from_probs,
-    top_k_renorm_prob,
-    top_k_top_p_sampling_from_probs,
-    top_p_renorm_prob,
-)
-from torch.library import custom_op as torch_custom_op
-from vllm.model_executor.custom_op import CustomOp
+from torch import nn
 
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
-
-# TODO: move this dict to another place
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+from sglang.srt.utils import is_hip
+
+# ROCm: flashinfer available later
+if not is_hip():
+    from flashinfer.sampling import (
+        min_p_sampling_from_probs,
+        top_k_renorm_prob,
+        top_k_top_p_sampling_from_probs,
+        top_p_renorm_prob,
+    )
 
 logger = logging.getLogger(__name__)
 
 
-@dataclasses.dataclass
-class SampleOutput:
-    success: torch.Tensor
-    probs: torch.Tensor
-    batch_next_token_ids: torch.Tensor
-
-
-class Sampler(CustomOp):
-    def __init__(self):
-        super().__init__()
-        # FIXME: torch.multinomial has too many bugs
-        self.forward_native = self.forward_cuda
-        self.is_torch_compile = False
-
-    def _get_probs(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo):
-        # Post process logits
-        logits = logits.contiguous()
-        logits.div_(sampling_info.temperatures)
-        if self.is_torch_compile:
-            # FIXME: Temporary workaround for unknown bugs in torch.compile
-            logits.add_(0)
-
-        return torch.softmax(logits, dim=-1)
-
-    def forward_cuda(
+class Sampler(nn.Module):
+    def forward(
         self,
         logits: Union[torch.Tensor, LogitsProcessorOutput],
         sampling_info: SamplingBatchInfo,
@@ -53,7 +30,15 @@ class Sampler(CustomOp):
         if isinstance(logits, LogitsProcessorOutput):
             logits = logits.next_token_logits
 
-        probs = self._get_probs(logits, sampling_info)
+        # Post process logits
+        logits.div_(sampling_info.temperatures)
+        probs = logits[:] = torch.softmax(logits, dim=-1)
+
+        if torch.any(torch.isnan(probs)):
+            logger.warning("Detected errors during sampling! NaN in the probability.")
+            probs = torch.where(
+                torch.isnan(probs), torch.full_like(probs, 1e-10), probs
+            )
 
         if global_server_args_dict["sampling_backend"] == "flashinfer":
             max_top_k_round, batch_size = 32, probs.shape[0]
@@ -67,12 +52,16 @@ class Sampler(CustomOp):
                     probs, uniform_samples, sampling_info.min_ps
                 )
             else:
-                batch_next_token_ids, success = flashinfer_top_k_top_p(
+                batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
                     probs, uniform_samples, sampling_info.top_ks, sampling_info.top_ps
                 )
+
+            if not torch.all(success):
+                logger.warning("Detected errors during sampling!")
+                batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
         elif global_server_args_dict["sampling_backend"] == "pytorch":
             # Here we provide a slower fallback implementation.
-            batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch(
+            batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
                 probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
             )
         else:
@@ -80,48 +69,7 @@ class Sampler(CustomOp):
                 f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
             )
 
-        return SampleOutput(success, probs, batch_next_token_ids)
-
-    def forward_native(
-        self,
-        logits: Union[torch.Tensor, LogitsProcessorOutput],
-        sampling_info: SamplingBatchInfo,
-    ):
-        if isinstance(logits, LogitsProcessorOutput):
-            logits = logits.next_token_logits
-
-        probs = self._get_probs(logits, sampling_info)
-
-        batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch(
-            probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
-        )
-
-        return SampleOutput(success, probs, batch_next_token_ids)
-
-
-@torch_custom_op("my_lib::flashinfer_top_k_top_p", mutates_args={})
-def flashinfer_top_k_top_p(
-    probs: torch.Tensor,
-    uniform_samples: torch.Tensor,
-    top_ks: torch.Tensor,
-    top_ps: torch.Tensor,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    # NOTE: we do not use min_p neither in CUDA nor in torch.compile
-    return top_k_top_p_sampling_from_probs(probs, uniform_samples, top_ks, top_ps)
-
-
-@flashinfer_top_k_top_p.register_fake
-def _(
-    probs: torch.Tensor,
-    uniform_samples: torch.Tensor,
-    top_ks: torch.Tensor,
-    top_ps: torch.Tensor,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    bs = probs.shape[0]
-    return (
-        torch.ones(bs, dtype=torch.bool, device=probs.device),
-        torch.zeros(bs, dtype=torch.int32, device=probs.device),
-    )
+        return batch_next_token_ids
 
 
 def top_k_top_p_min_p_sampling_from_probs_torch(
@@ -141,19 +89,6 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
     ] = 0.0
     probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0
     probs_sort.div_(probs_sort.max(dim=-1, keepdim=True)[0])
-    try:
-        # FIXME: torch.multiomial does not support num_samples = 1
-        sampled_index = torch.multinomial(probs_sort, num_samples=2, replacement=True)[
-            :, :1
-        ]
-    except RuntimeError as e:
-        logger.warning(f"Sampling error: {e}")
-        batch_next_token_ids = torch.zeros(
-            (probs_sort.shape[0],), dtype=torch.int32, device=probs.device
-        )
-        success = torch.zeros(probs.shape[0], dtype=torch.bool, device=probs.device)
-        return batch_next_token_ids, success
-
+    sampled_index = torch.multinomial(probs_sort, num_samples=1)
     batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index).view(-1)
-
-    return batch_next_token_ids, success
+    return batch_next_token_ids
```
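With the custom-op wrapper gone, the PyTorch fallback now samples once with `torch.multinomial(..., num_samples=1)` and returns token ids without a success mask. A self-contained, simplified illustration of the top-k/top-p/min-p masking that fallback performs (it mirrors the shape of `top_k_top_p_min_p_sampling_from_probs_torch` but is not a drop-in replacement):

```python
import torch

torch.manual_seed(0)

probs = torch.softmax(torch.randn(2, 8), dim=-1)  # [batch, vocab]
top_k = torch.tensor([3, 5])
top_p = torch.tensor([0.9, 0.8])
min_p = torch.tensor([0.05, 0.02])

# Sort descending, then zero out entries outside the nucleus / top-k / below min-p.
probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
probs_sum = probs_sort.cumsum(dim=-1)
min_p_thresholds = probs_sort[:, 0] * min_p

probs_sort[(probs_sum - probs_sort) > top_p.view(-1, 1)] = 0.0          # top-p
probs_sort[torch.arange(probs.shape[-1]) >= top_k.view(-1, 1)] = 0.0    # top-k
probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0             # min-p
probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))

# Draw one sample per row and map it back to vocabulary indices.
sampled = torch.multinomial(probs_sort, num_samples=1)
next_token_ids = torch.gather(probs_idx, dim=1, index=sampled).view(-1)
print(next_token_ids)
```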
sglang/srt/lora/lora_manager.py
CHANGED
```diff
@@ -21,12 +21,15 @@ import re
 from dataclasses import dataclass
 
 import torch
-from flashinfer import SegmentGEMMWrapper
 
 from sglang.srt.lora.lora import LoRAAdapter, get_lora_layer
 from sglang.srt.lora.lora_config import LoRAConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
-from sglang.srt.utils import replace_submodule
+from sglang.srt.utils import is_hip, replace_submodule
+
+# ROCm: flashinfer available later
+if not is_hip():
+    from flashinfer import SegmentGEMMWrapper
 
 
 def get_stacked_name(name):
@@ -96,10 +99,10 @@ class LoRAManager:
         # get configs and target modules
         self.configs = {}
         self.origin_target_modules = set()
-        for path in self.lora_paths:
-            self.configs[path] = LoRAConfig(path)
+        for name, path in self.lora_paths.items():
+            self.configs[name] = LoRAConfig(path)
             self.origin_target_modules = set(self.origin_target_modules) | set(
-                self.configs[path].target_modules
+                self.configs[name].target_modules
             )
         self.target_modules = set(
             [
@@ -114,11 +117,11 @@ class LoRAManager:
         # load all weights to cpu
         self.loras = []
         self.lora_id = {}
-        for path in self.lora_paths:
-            self.lora_id[path] = len(self.loras)
+        for name in self.lora_paths.keys():
+            self.lora_id[name] = len(self.loras)
             self.loras.append(
                 LoRAAdapter(
-                    path, self.configs[path], self.base_hf_config, self.load_config
+                    name, self.configs[name], self.base_hf_config, self.load_config
                 )
             )
             self.loras[-1].initialize_weights()
```
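The loops now iterate `.items()` and `.keys()`, which implies `lora_paths` is treated as a mapping from a user-chosen adapter name to its checkpoint path rather than a plain list of paths. A small sketch of that shape (the names and paths below are placeholders):

```python
lora_paths = {
    "chat-adapter": "/checkpoints/loras/chat-adapter",
    "code-adapter": "/checkpoints/loras/code-adapter",
}

# Mirrors the manager's bookkeeping: configs and ids are keyed by adapter name.
configs = {name: path for name, path in lora_paths.items()}
lora_id = {name: i for i, name in enumerate(lora_paths)}
print(lora_id)  # {'chat-adapter': 0, 'code-adapter': 1}
```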
sglang/srt/managers/tp_worker.py
CHANGED
```diff
@@ -198,6 +198,7 @@ class ModelTpServer:
                 "trust_remote_code": server_args.trust_remote_code,
             },
             skip_tokenizer_init=server_args.skip_tokenizer_init,
+            constrained_json_whitespace_pattern=server_args.constrained_json_whitespace_pattern,
         )
         self.jump_forward_cache = JumpForwardCache()
 
@@ -414,7 +415,7 @@ class ModelTpServer:
 
         # Truncate prompts that are too long
         if len(req.origin_input_ids) >= self.max_req_input_len:
-            logger.warn(
+            logger.warning(
                 "Request length is longer than the KV cache pool size or "
                 "the max context length. Truncated!!!"
             )
@@ -807,12 +808,10 @@ class ModelTpServer:
                 unfinished_indices.append(i)
 
             if req.finished() or (
-
-
-
-
-                    or len(req.output_ids) == 1
-                )
+                req.stream
+                and (
+                    self.decode_forward_ct % self.stream_interval == 0
+                    or len(req.output_ids) == 1
                 )
             ):
                 output_rids.append(req.rid)
@@ -937,6 +936,8 @@ class ModelTpServer:
         if success:
             flash_cache_success = self.flush_cache()
             assert flash_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
         return success, message
 
 
```
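The simplified streaming condition can be read as a single predicate: flush a request's partial output when it is finished, or when it is a streaming request and either the decode step counter hits the stream interval or the very first output token has just been produced. The sketch below restates that logic with plain arguments (a paraphrase, not sglang code):

```python
def should_emit(finished: bool, stream: bool, decode_forward_ct: int,
                stream_interval: int, num_output_ids: int) -> bool:
    return finished or (
        stream
        and (decode_forward_ct % stream_interval == 0 or num_output_ids == 1)
    )


print(should_emit(False, True, decode_forward_ct=8, stream_interval=8, num_output_ids=5))   # True
print(should_emit(False, False, decode_forward_ct=8, stream_interval=8, num_output_ids=5))  # False
```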
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
```diff
@@ -41,6 +41,9 @@ if TYPE_CHECKING:
 def _to_torch(model: torch.nn.Module, reverse: bool = False):
     for sub in model._modules.values():
         if isinstance(sub, CustomOp):
+            # NOTE: FusedMoE torch native implementaiton is not efficient
+            if "FusedMoE" in sub.__class__.__name__:
+                continue
             if reverse:
                 sub._forward_method = sub.forward_cuda
                 setattr(sub, "is_torch_compile", False)
@@ -105,7 +108,15 @@ class CudaGraphRunner:
             self.capture_bs = list(range(1, 32)) + [64, 128]
         else:
             self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
-        self.compile_bs =
+        self.compile_bs = (
+            [
+                bs
+                for bs in self.capture_bs
+                if bs <= self.model_runner.server_args.max_torch_compile_bs
+            ]
+            if self.use_torch_compile
+            else []
+        )
 
         # Common inputs
         self.max_bs = max(self.capture_bs)
```