sglang 0.3.1__py3-none-any.whl → 0.3.1.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. sglang/bench_latency.py +10 -3
  2. sglang/bench_server_latency.py +187 -0
  3. sglang/bench_serving.py +1 -1
  4. sglang/global_config.py +5 -13
  5. sglang/lang/interpreter.py +0 -3
  6. sglang/srt/constrained/fsm_cache.py +5 -1
  7. sglang/srt/layers/activation.py +16 -1
  8. sglang/srt/layers/attention_backend.py +12 -12
  9. sglang/srt/layers/fused_moe/layer.py +27 -7
  10. sglang/srt/layers/layernorm.py +21 -6
  11. sglang/srt/layers/sampler.py +40 -98
  12. sglang/srt/lora/lora_manager.py +11 -8
  13. sglang/srt/managers/io_struct.py +3 -0
  14. sglang/srt/managers/policy_scheduler.py +49 -93
  15. sglang/srt/managers/schedule_batch.py +2 -1
  16. sglang/srt/managers/tp_worker.py +19 -13
  17. sglang/srt/model_executor/cuda_graph_runner.py +25 -13
  18. sglang/srt/model_executor/model_runner.py +37 -46
  19. sglang/srt/models/deepseek_v2.py +8 -3
  20. sglang/srt/models/llama.py +1 -3
  21. sglang/srt/models/llama_classification.py +2 -3
  22. sglang/srt/models/minicpm3.py +7 -3
  23. sglang/srt/models/olmoe.py +415 -0
  24. sglang/srt/models/xverse.py +1 -3
  25. sglang/srt/models/xverse_moe.py +1 -4
  26. sglang/srt/sampling/sampling_batch_info.py +3 -50
  27. sglang/srt/server.py +6 -1
  28. sglang/srt/server_args.py +39 -10
  29. sglang/srt/utils.py +7 -51
  30. sglang/test/few_shot_gsm8k.py +8 -2
  31. sglang/test/test_utils.py +1 -1
  32. sglang/version.py +1 -1
  33. {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/METADATA +4 -5
  34. {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/RECORD +37 -35
  35. {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/WHEEL +1 -1
  36. {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/LICENSE +0 -0
  37. {sglang-0.3.1.dist-info → sglang-0.3.1.post2.dist-info}/top_level.txt +0 -0
sglang/bench_latency.py CHANGED
@@ -1,5 +1,7 @@
 """
-Benchmark the latency of a given model. It accepts arguments similar to those of launch_server.py.
+Benchmark the latency of running a single static batch.
+This script does not launch a server and uses the low-level APIs.
+It accepts arguments similar to those of launch_server.py.

 # Usage (latency test)
 ## with dummy weights:
@@ -63,7 +65,7 @@ from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import suppress_other_loggers
+from sglang.srt.utils import kill_child_process, suppress_other_loggers


 @dataclasses.dataclass
@@ -502,4 +504,9 @@ if __name__ == "__main__":
         format="%(message)s",
     )

-    main(server_args, bench_args)
+    try:
+        main(server_args, bench_args)
+    except Exception as e:
+        raise e
+    finally:
+        kill_child_process(os.getpid(), including_parent=False)
sglang/bench_server_latency.py ADDED
@@ -0,0 +1,187 @@
+"""
+Benchmark the latency of serving a single batch with a real server.
+This script launches a server and uses the HTTP interface.
+It accepts arguments similar to those of launch_server.py.
+
+Usage:
+
+python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+"""
+
+import argparse
+import dataclasses
+import itertools
+import json
+import multiprocessing
+import os
+import time
+from typing import Tuple
+
+import numpy as np
+import requests
+
+from sglang.srt.server import launch_server
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import kill_child_process
+
+
+@dataclasses.dataclass
+class BenchArgs:
+    run_name: str = "default"
+    batch_size: Tuple[int] = (1,)
+    input_len: Tuple[int] = (1024,)
+    output_len: Tuple[int] = (16,)
+    result_filename: str = "result.jsonl"
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
+        parser.add_argument(
+            "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
+        )
+        parser.add_argument(
+            "--input-len", type=int, nargs="+", default=BenchArgs.input_len
+        )
+        parser.add_argument(
+            "--output-len", type=int, nargs="+", default=BenchArgs.output_len
+        )
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        # use the default value's type to case the args into correct types.
+        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+        return cls(
+            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+        )
+
+
+def launch_server_internal(server_args):
+    try:
+        launch_server(server_args)
+    except Exception as e:
+        raise e
+    finally:
+        kill_child_process(os.getpid(), including_parent=False)
+
+
+def launch_server_process(server_args: ServerArgs):
+    proc = multiprocessing.Process(target=launch_server_internal, args=(server_args,))
+    proc.start()
+    base_url = f"http://{server_args.host}:{server_args.port}"
+    timeout = 600
+
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        try:
+            headers = {
+                "Content-Type": "application/json; charset=utf-8",
+            }
+            response = requests.get(f"{base_url}/v1/models", headers=headers)
+            if response.status_code == 200:
+                return proc, base_url
+        except requests.RequestException:
+            pass
+        time.sleep(10)
+    raise TimeoutError("Server failed to start within the timeout period.")
+
+
+def run_one_case(
+    url: str,
+    batch_size: int,
+    input_len: int,
+    output_len: int,
+    run_name: str,
+    result_filename: str,
+):
+    input_ids = [
+        [int(x) for x in np.random.randint(0, high=16384, size=(input_len,))]
+        for _ in range(batch_size)
+    ]
+
+    tic = time.time()
+    response = requests.post(
+        url + "/generate",
+        json={
+            "input_ids": input_ids,
+            "sampling_params": {
+                "temperature": 0,
+                "max_new_tokens": output_len,
+                "ignore_eos": True,
+            },
+        },
+    )
+    latency = time.time() - tic
+
+    _ = response.json()
+    output_throughput = batch_size * output_len / latency
+    overall_throughput = batch_size * (input_len + output_len) / latency
+
+    print(f"batch size: {batch_size}")
+    print(f"latency: {latency:.2f} s")
+    print(f"output throughput: {output_throughput:.2f} token/s")
+    print(f"(input + output) throughput: {overall_throughput:.2f} token/s")
+
+    if result_filename:
+        with open(result_filename, "a") as fout:
+            res = {
+                "run_name": run_name,
+                "batch_size": batch_size,
+                "input_len": input_len,
+                "output_len": output_len,
+                "latency": round(latency, 4),
+                "output_throughput": round(output_throughput, 2),
+                "overall_throughput": round(overall_throughput, 2),
+            }
+            fout.write(json.dumps(res) + "\n")
+
+
+def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
+    proc, base_url = launch_server_process(server_args)
+
+    # warmup
+    run_one_case(
+        base_url,
+        batch_size=16,
+        input_len=1024,
+        output_len=16,
+        run_name="",
+        result_filename="",
+    )
+
+    # benchmark
+    try:
+        for bs, il, ol in itertools.product(
+            bench_args.batch_size, bench_args.input_len, bench_args.output_len
+        ):
+            run_one_case(
+                base_url,
+                bs,
+                il,
+                ol,
+                bench_args.run_name,
+                bench_args.result_filename,
+            )
+    finally:
+        kill_child_process(proc.pid)
+
+    print(f"\nResults are saved to {bench_args.result_filename}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    BenchArgs.add_cli_args(parser)
+    # For this script, model-path is not required
+    assert (
+        parser._actions[1].option_strings[0] == "--model-path"
+    ), "options changed, this code need to be updated"
+    parser._actions[1].required = False
+    args = parser.parse_args()
+
+    server_args = ServerArgs.from_cli_args(args)
+    bench_args = BenchArgs.from_cli_args(args)
+
+    run_benchmark(server_args, bench_args)
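
For reference (this snippet is not from the package): the throughput figures printed by run_one_case follow directly from the formulas shown above. A quick sanity check in plain Python, using a made-up latency value:

batch_size, input_len, output_len = 16, 1024, 16
latency = 2.0  # hypothetical wall-clock seconds measured around the /generate request

output_throughput = batch_size * output_len / latency                  # 128.0 token/s
overall_throughput = batch_size * (input_len + output_len) / latency   # 8320.0 token/s
print(output_throughput, overall_throughput)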
sglang/bench_serving.py CHANGED
@@ -2,7 +2,7 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py

 """
-Benchmark online serving.
+Benchmark online serving with dynamic requests.

 Usage:
 python3 -m sglang.bench_serving --backend sglang --num-prompt 10
sglang/global_config.py CHANGED
@@ -1,5 +1,7 @@
 """Global configurations"""

+import os
+

 class GlobalConfig:
     def __init__(self):
@@ -16,30 +18,20 @@ class GlobalConfig:
         self.base_min_new_token_ratio = 0.1
         self.new_token_ratio_decay = 0.001

-        # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
-        # This can improve the speed for large batch sizes during prefill.
-        self.layer_sync_threshold = 8192
-
         # Runtime constants: others
         self.num_continue_decode_steps = 10
         self.retract_decode_steps = 20
-        self.flashinfer_workspace_size = 384 * 1024 * 1024
+        self.flashinfer_workspace_size = os.environ.get(
+            "FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024
+        )

         # Output tokenization configs
         self.skip_special_tokens_in_output = True
         self.spaces_between_special_tokens_in_out = True

         # Interpreter optimization configs
-        self.eager_fill_image = False
         self.enable_precache_with_tracing = True
         self.enable_parallel_encoding = True
-        self.enable_parallel_decoding = True
-
-        # Deprecated
-        # Choices: ["no_adjust", "adjust_cache"]
-        # no_adjust: Do not adjust the position embedding of KV cache.
-        # adjust_cache: Adjust the position embedding of KV cache.
-        self.concate_and_append_mode = "no_adjust"


 global_config = GlobalConfig()
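
For reference (this snippet is not from the package): the new lookup makes the flashinfer workspace size overridable via the FLASHINFER_WORKSPACE_SIZE environment variable. One detail worth knowing is that os.environ.get returns the integer default when the variable is unset, but the raw string when it is set; a minimal illustration:

import os

os.environ.pop("FLASHINFER_WORKSPACE_SIZE", None)
print(os.environ.get("FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024))  # 402653184 (the int default)

os.environ["FLASHINFER_WORKSPACE_SIZE"] = str(512 * 1024 * 1024)
print(os.environ.get("FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024))  # '536870912' (a string from the environment)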
sglang/lang/interpreter.py CHANGED
@@ -434,9 +434,6 @@ class StreamExecutor:
         self.cur_images.append((path, base64_data))
         self.text_ += self.chat_template.image_token

-        # if global_config.eager_fill_image:
-        #     self.backend.fill_image(self)
-
     def _spec_gen(self, sampling_params):
         stop = sampling_params.stop
         max_new_tokens = sampling_params.max_new_tokens
sglang/srt/constrained/fsm_cache.py CHANGED
@@ -29,6 +29,7 @@ class FSMCache(BaseToolCache):
         tokenizer_args_dict,
         enable=True,
         skip_tokenizer_init=False,
+        constrained_json_whitespace_pattern=None,
     ):
         super().__init__(enable=enable)

@@ -63,11 +64,14 @@
             self.outlines_tokenizer.vocabulary = (
                 self.outlines_tokenizer.tokenizer.get_vocab()
            )
+        self.constrained_json_whitespace_pattern = constrained_json_whitespace_pattern

    def init_value(self, key):
        key_type, key_string = key
        if key_type == "json":
-            regex = build_regex_from_schema(key_string, whitespace_pattern=r"[\n\t ]*")
+            regex = build_regex_from_schema(
+                key_string, whitespace_pattern=self.constrained_json_whitespace_pattern
+            )
        elif key_type == "regex":
            regex = key_string
        else:
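
For reference (this snippet is not from the package): the previously hard-coded whitespace_pattern r"[\n\t ]*" allows runs of spaces, tabs, and newlines between JSON tokens; the new constructor argument lets callers supply their own pattern, and passing None presumably defers to the outlines library default. A small re-based illustration of what the old pattern matches:

import re

ws = re.compile(r"[\n\t ]*")  # the pattern that used to be hard-coded in init_value

for s in ["", "  ", "\n\t ", " x"]:
    print(repr(s), "->", repr(ws.match(s).group(0)))
# ''      -> ''
# '  '    -> '  '
# '\n\t ' -> '\n\t '
# ' x'    -> ' '   (only the leading whitespace is consumed)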
sglang/srt/layers/activation.py CHANGED
@@ -13,12 +13,18 @@ limitations under the License.

 """Fused operators for activation layers."""

+import logging
 from typing import Optional

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+
+from sglang.srt.utils import is_hip
+
+if not is_hip():
+    from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+
 from vllm.distributed import (
     divide,
     get_tensor_model_parallel_rank,
@@ -28,6 +34,8 @@ from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.utils import set_weight_attrs

+logger = logging.getLogger(__name__)
+

 class SiluAndMul(CustomOp):
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
@@ -135,3 +143,10 @@ def get_act_fn(
             act_fn, intermediate_size, input_is_parallel, params_dtype
         )
     return act_fn
+
+
+if is_hip():
+    logger.info(
+        "FlashInfer is not available on AMD GPUs. Fallback to other kernel libraries."
+    )
+    from vllm.model_executor.layers.activation import GeluAndMul, SiluAndMul
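
For reference: sglang.srt.utils.is_hip itself is not shown in this changeset. As an assumption for readers, a check like this is typically built on the ROCm build information that PyTorch exposes (the sketch below is illustrative, not the sglang source):

import torch

def is_hip() -> bool:
    # torch.version.hip is a version string on ROCm builds of PyTorch and None on CUDA builds.
    return torch.version.hip is not None

print(is_hip())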
sglang/srt/layers/attention_backend.py CHANGED
@@ -12,22 +12,26 @@ from typing import TYPE_CHECKING

 import torch
 import torch.nn as nn
-from flashinfer import (
-    BatchDecodeWithPagedKVCacheWrapper,
-    BatchPrefillWithPagedKVCacheWrapper,
-    BatchPrefillWithRaggedKVCacheWrapper,
-)
-from flashinfer.cascade import merge_state
-from flashinfer.decode import _grouped_size_compiled_for_decode_kernels

 from sglang.global_config import global_config
 from sglang.srt.layers.flashinfer_utils import update_flashinfer_indices
 from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
+from sglang.srt.utils import is_hip

 if TYPE_CHECKING:
     from sglang.srt.model_executor.model_runner import ModelRunner

+# ROCm: flashinfer available later
+if not is_hip():
+    from flashinfer import (
+        BatchDecodeWithPagedKVCacheWrapper,
+        BatchPrefillWithPagedKVCacheWrapper,
+        BatchPrefillWithRaggedKVCacheWrapper,
+    )
+    from flashinfer.cascade import merge_state
+    from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
+

 class AttentionBackend(ABC):
     """The base class of attention backends"""
@@ -150,7 +154,7 @@ class FlashInferAttnBackend(AttentionBackend):
         # Some heuristics to check whether to use ragged forward
         use_ragged = False
         if (
-            int(torch.sum(input_metadata.seq_lens)) > 4096
+            torch.sum(input_metadata.seq_lens).item() >= 4096
             and self.model_runner.sliding_window_size is None
         ):
             use_ragged = True
@@ -301,10 +305,6 @@ class FlashInferAttnBackend(AttentionBackend):
             layer.layer_id, input_metadata.out_cache_loc, k, v
         )

-        if total_num_tokens >= global_config.layer_sync_threshold:
-            # TODO: Revisit this. Why is this synchronize needed?
-            torch.cuda.synchronize()
-
         return o.view(-1, layer.tp_q_head_num * layer.head_dim)

     def forward_decode(self, q, k, v, layer: nn.Module, input_metadata: InputMetadata):
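
For reference (this snippet is not from the package): the ragged-prefill heuristic now extracts the scalar with Tensor.item() and uses an inclusive threshold. For an integer seq_lens tensor, int(t) and t.item() return the same Python integer, so the observable difference in the comparison is only at exactly 4096 total tokens:

import torch

seq_lens = torch.tensor([2048, 2048])
total = torch.sum(seq_lens)
print(int(total), total.item())                 # 4096 4096
print(int(total) > 4096, total.item() >= 4096)  # False True: the boundary case now takes the ragged path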
sglang/srt/layers/fused_moe/layer.py CHANGED
@@ -18,6 +18,8 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.quantization.fp8 import Fp8Config
 from vllm.model_executor.utils import set_weight_attrs

+from sglang.srt.utils import is_hip
+
 logger = init_logger(__name__)


@@ -381,6 +383,7 @@ from torch.nn import Module
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     all_close_1d,
+    normalize_e4m3fn_to_e4m3fnuz,
     per_tensor_dequantize,
 )
 from vllm.utils import print_warning_once
@@ -479,14 +482,12 @@ class Fp8MoEMethod(FusedMoEMethodBase):

     def process_weights_after_loading(self, layer: Module) -> None:

-        # If checkpoint is fp16, quantize in place.
+        # If checkpoint is fp16 or bfloat16, quantize in place.
         if not self.quant_config.is_checkpoint_fp8_serialized:
-            w13_weight = torch.empty_like(
-                layer.w13_weight.data, dtype=torch.float8_e4m3fn
-            )
-            w2_weight = torch.empty_like(
-                layer.w2_weight.data, dtype=torch.float8_e4m3fn
-            )
+            # If ROCm, use float8_e4m3fnuz instead (MI300x HW)
+            fp8_dtype = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn
+            w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
+            w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)

             # Re-initialize w13_scale because we directly quantize
             # merged w13 weights and generate a single scaling factor.
@@ -534,6 +535,25 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     layer.a2_scale.max(), requires_grad=False
                 )

+            # If ROCm, normalize the weights and scales to e4m3fnuz
+            if is_hip():
+                # Normalize the weights and scales
+                w13_weight, w13_scale, a13_scale = normalize_e4m3fn_to_e4m3fnuz(
+                    layer.w13_weight, layer.w13_scale, layer.a13_scale
+                )
+                w2_weight, w2_scale, a2_scale = normalize_e4m3fn_to_e4m3fnuz(
+                    layer.w2_weight, layer.w2_scale, layer.a2_scale
+                )
+                # Reset the parameters
+                layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+                layer.w13_scale = torch.nn.Parameter(w13_scale, requires_grad=False)
+                if a13_scale is not None:
+                    layer.a13_scale = torch.nn.Parameter(a13_scale, requires_grad=False)
+                layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+                layer.w2_scale = torch.nn.Parameter(w2_scale, requires_grad=False)
+                if a2_scale is not None:
+                    layer.a2_scale = torch.nn.Parameter(a2_scale, requires_grad=False)
+
             # Fp8 moe kernel needs single weight scale for w13 per expert.
             # We take the max then dequant and requant each expert.
             assert layer.w13_scale is not None
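
For reference (this snippet is not from the package): the ROCm branch switches to torch.float8_e4m3fnuz because MI300-class GPUs use the fnuz fp8 variant, whose representable range differs from the OCP float8_e4m3fn format; that range difference is why the extra normalize_e4m3fn_to_e4m3fnuz rescaling step is applied. A quick way to compare the two dtypes (assumes a PyTorch build that exposes both):

import torch

for dtype in (torch.float8_e4m3fn, torch.float8_e4m3fnuz):
    info = torch.finfo(dtype)
    # e4m3fn tops out at 448 while e4m3fnuz tops out at 240, so scales must be adjusted on conversion.
    print(dtype, "min:", info.min, "max:", info.max)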
sglang/srt/layers/layernorm.py CHANGED
@@ -15,18 +15,26 @@ limitations under the License.

 """Fused operators for normalization layers."""

+import logging
 from typing import Optional, Tuple, Union

 import torch
 import torch.nn as nn
-from flashinfer.norm import (
-    fused_add_rmsnorm,
-    gemma_fused_add_rmsnorm,
-    gemma_rmsnorm,
-    rmsnorm,
-)
+
+from sglang.srt.utils import is_hip
+
+if not is_hip():
+    from flashinfer.norm import (
+        fused_add_rmsnorm,
+        gemma_fused_add_rmsnorm,
+        gemma_rmsnorm,
+        rmsnorm,
+    )
+
 from vllm.model_executor.custom_op import CustomOp

+logger = logging.getLogger(__name__)
+

 class RMSNorm(CustomOp):
     def __init__(
@@ -109,3 +117,10 @@ class GemmaRMSNorm(CustomOp):
             return x, residual
         out = gemma_rmsnorm(x, self.weight.data, self.variance_epsilon)
         return out
+
+
+if is_hip():
+    logger.info(
+        "FlashInfer is not available on AMD GPUs. Fallback to other kernel libraries."
+    )
+    from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm
sglang/srt/layers/sampler.py CHANGED
@@ -1,51 +1,28 @@
-import dataclasses
 import logging
-from typing import Tuple, Union
+from typing import Union

 import torch
-from flashinfer.sampling import (
-    min_p_sampling_from_probs,
-    top_k_renorm_prob,
-    top_k_top_p_sampling_from_probs,
-    top_p_renorm_prob,
-)
-from torch.library import custom_op as torch_custom_op
-from vllm.model_executor.custom_op import CustomOp
+from torch import nn

 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
-
-# TODO: move this dict to another place
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+from sglang.srt.utils import is_hip
+
+# ROCm: flashinfer available later
+if not is_hip():
+    from flashinfer.sampling import (
+        min_p_sampling_from_probs,
+        top_k_renorm_prob,
+        top_k_top_p_sampling_from_probs,
+        top_p_renorm_prob,
+    )

 logger = logging.getLogger(__name__)


-@dataclasses.dataclass
-class SampleOutput:
-    success: torch.Tensor
-    probs: torch.Tensor
-    batch_next_token_ids: torch.Tensor
-
-
-class Sampler(CustomOp):
-    def __init__(self):
-        super().__init__()
-        # FIXME: torch.multinomial has too many bugs
-        self.forward_native = self.forward_cuda
-        self.is_torch_compile = False
-
-    def _get_probs(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo):
-        # Post process logits
-        logits = logits.contiguous()
-        logits.div_(sampling_info.temperatures)
-        if self.is_torch_compile:
-            # FIXME: Temporary workaround for unknown bugs in torch.compile
-            logits.add_(0)
-
-        return torch.softmax(logits, dim=-1)
-
-    def forward_cuda(
+class Sampler(nn.Module):
+    def forward(
         self,
         logits: Union[torch.Tensor, LogitsProcessorOutput],
         sampling_info: SamplingBatchInfo,
@@ -53,7 +30,18 @@
         if isinstance(logits, LogitsProcessorOutput):
             logits = logits.next_token_logits

-        probs = self._get_probs(logits, sampling_info)
+        # Post process logits
+        logits = logits.contiguous()
+        logits.div_(sampling_info.temperatures)
+        probs = torch.softmax(logits, dim=-1)
+        logits = None
+        del logits
+
+        if torch.any(torch.isnan(probs)):
+            logger.warning("Detected errors during sampling! NaN in the probability.")
+            probs = torch.where(
+                torch.isnan(probs), torch.full_like(probs, 1e-10), probs
+            )

         if global_server_args_dict["sampling_backend"] == "flashinfer":
             max_top_k_round, batch_size = 32, probs.shape[0]
@@ -67,12 +55,20 @@
                     probs, uniform_samples, sampling_info.min_ps
                 )
             else:
-                batch_next_token_ids, success = flashinfer_top_k_top_p(
-                    probs, uniform_samples, sampling_info.top_ks, sampling_info.top_ps
+                batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
+                    probs,
+                    uniform_samples,
+                    sampling_info.top_ks,
+                    sampling_info.top_ps,
+                    filter_apply_order="joint",
                 )
+
+            if not torch.all(success):
+                logger.warning("Detected errors during sampling!")
+                batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
         elif global_server_args_dict["sampling_backend"] == "pytorch":
             # Here we provide a slower fallback implementation.
-            batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch(
+            batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
                 probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
             )
         else:
@@ -80,48 +76,7 @@
                 f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
             )

-        return SampleOutput(success, probs, batch_next_token_ids)
-
-    def forward_native(
-        self,
-        logits: Union[torch.Tensor, LogitsProcessorOutput],
-        sampling_info: SamplingBatchInfo,
-    ):
-        if isinstance(logits, LogitsProcessorOutput):
-            logits = logits.next_token_logits
-
-        probs = self._get_probs(logits, sampling_info)
-
-        batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch(
-            probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
-        )
-
-        return SampleOutput(success, probs, batch_next_token_ids)
-
-
-@torch_custom_op("my_lib::flashinfer_top_k_top_p", mutates_args={})
-def flashinfer_top_k_top_p(
-    probs: torch.Tensor,
-    uniform_samples: torch.Tensor,
-    top_ks: torch.Tensor,
-    top_ps: torch.Tensor,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    # NOTE: we do not use min_p neither in CUDA nor in torch.compile
-    return top_k_top_p_sampling_from_probs(probs, uniform_samples, top_ks, top_ps)
-
-
-@flashinfer_top_k_top_p.register_fake
-def _(
-    probs: torch.Tensor,
-    uniform_samples: torch.Tensor,
-    top_ks: torch.Tensor,
-    top_ps: torch.Tensor,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    bs = probs.shape[0]
-    return (
-        torch.ones(bs, dtype=torch.bool, device=probs.device),
-        torch.zeros(bs, dtype=torch.int32, device=probs.device),
-    )
+        return batch_next_token_ids


 def top_k_top_p_min_p_sampling_from_probs_torch(
@@ -141,19 +96,6 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
     ] = 0.0
     probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0
     probs_sort.div_(probs_sort.max(dim=-1, keepdim=True)[0])
-    try:
-        # FIXME: torch.multiomial does not support num_samples = 1
-        sampled_index = torch.multinomial(probs_sort, num_samples=2, replacement=True)[
-            :, :1
-        ]
-    except RuntimeError as e:
-        logger.warning(f"Sampling error: {e}")
-        batch_next_token_ids = torch.zeros(
-            (probs_sort.shape[0],), dtype=torch.int32, device=probs.device
-        )
-        success = torch.zeros(probs.shape[0], dtype=torch.bool, device=probs.device)
-        return batch_next_token_ids, success
-
+    sampled_index = torch.multinomial(probs_sort, num_samples=1)
     batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index).view(-1)
-    success = torch.ones(probs.shape[0], dtype=torch.bool, device=probs.device)
-    return batch_next_token_ids, success
+    return batch_next_token_ids
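
For reference (this snippet is not from the package): the PyTorch fallback now returns only the sampled token ids and drops the success flag and the try/except around torch.multinomial. For readers unfamiliar with the filtering order, here is a minimal, self-contained sketch of torch-native top-k/top-p/min-p sampling in the same spirit as top_k_top_p_min_p_sampling_from_probs_torch; the function name, scalar thresholds, and toy shapes below are illustrative, not the sglang implementation:

import torch

def sample_top_k_top_p_min_p(probs: torch.Tensor, top_k: int, top_p: float, min_p: float) -> torch.Tensor:
    # Sort each row in descending order so cumulative sums give the top-p mass.
    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
    cum = torch.cumsum(probs_sort, dim=-1)
    # top-p: zero out tokens whose preceding cumulative mass already exceeds top_p.
    probs_sort[(cum - probs_sort) > top_p] = 0.0
    # top-k: keep only the k highest-probability tokens per row.
    ranks = torch.arange(probs.shape[-1], device=probs.device).expand_as(probs_sort)
    probs_sort[ranks >= top_k] = 0.0
    # min-p: drop tokens below min_p times the per-row maximum probability.
    probs_sort[probs_sort < min_p * probs_sort[:, :1]] = 0.0
    # Renormalize, sample, and map back to original vocabulary indices.
    probs_sort = probs_sort / probs_sort.sum(dim=-1, keepdim=True)
    sampled = torch.multinomial(probs_sort, num_samples=1)
    return torch.gather(probs_idx, dim=1, index=sampled).view(-1)

# Toy usage: a batch of 2 rows over a vocabulary of 5 tokens.
probs = torch.softmax(torch.randn(2, 5), dim=-1)
print(sample_top_k_top_p_min_p(probs, top_k=3, top_p=0.9, min_p=0.05))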