sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +220 -378
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +9 -6
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +143 -6
- sglang/srt/managers/schedule_batch.py +237 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +681 -259
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +224 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +44 -18
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +94 -36
- sglang/srt/model_executor/cuda_graph_runner.py +55 -24
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +208 -28
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -32
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +136 -52
- sglang/srt/speculative/build_eagle_tree.py +2 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
- sglang/srt/speculative/eagle_utils.py +92 -58
- sglang/srt/speculative/eagle_worker.py +186 -94
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +200 -166
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
```diff
--- a/sglang/srt/sampling/sampling_batch_info.py
+++ b/sglang/srt/sampling/sampling_batch_info.py
@@ -9,9 +9,6 @@ import torch

 import sglang.srt.sampling.penaltylib as penaltylib
 from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
-from sglang.srt.sampling.penaltylib.penalizers.repetition_penalty import (
-    apply_scaling_penalties,
-)

 logger = logging.getLogger(__name__)

@@ -22,49 +19,45 @@ if TYPE_CHECKING:

 @dataclasses.dataclass
 class SamplingBatchInfo:
-    #
+    # Basic batched sampling params
     temperatures: torch.Tensor
     top_ps: torch.Tensor
     top_ks: torch.Tensor
     min_ps: torch.Tensor

-    #
+    # Whether all requests use greedy sampling
     is_all_greedy: bool

-    #
+    # Whether any request needs min_p sampling
     need_min_p_sampling: bool

-    #
-    has_custom_logit_processor: bool
-
-    # Bias Tensors
+    # Masking tensors for grammar-guided structured outputs
     vocab_size: int
     grammars: Optional[List] = None
-    sampling_info_done: Optional[threading.Event] = None
-    logit_bias: torch.Tensor = None
     vocab_mask: Optional[torch.Tensor] = None
-
+    apply_mask_func: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None
+
+    # An event used for overlap schedule
+    sampling_info_done: Optional[threading.Event] = None

     # Penalizer
     penalizer_orchestrator: Optional[penaltylib.BatchedPenalizerOrchestrator] = None
-
-    scaling_penalties: Optional[torch.Tensor] = None
+    linear_penalty: torch.Tensor = None

-    #
-
-
-    # Custom Parameters
+    # Whether any request has custom logit processor
+    has_custom_logit_processor: bool = False
+    # Custom parameters
     custom_params: Optional[List[Optional[Dict[str, Any]]]] = None
-
-    # Custom Logit Processor
+    # Custom logit processor
     custom_logit_processor: Optional[
         Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]
     ] = None

+    # Device
+    device: str = "cuda"
+
     @classmethod
-    def from_schedule_batch(
-        cls, batch: ScheduleBatch, vocab_size: int, enable_overlap_schedule: bool
-    ):
+    def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
         reqs = batch.reqs
         device = batch.device
         temperatures = (
@@ -118,106 +111,60 @@ class SamplingBatchInfo:
         merged_custom_logit_processor = None
         custom_params = None

-        ret = cls(
-            temperatures=temperatures,
-            top_ps=top_ps,
-            top_ks=top_ks,
-            min_ps=min_ps,
-            need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
-            is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
-            has_custom_logit_processor=has_custom_logit_processor,
-            vocab_size=vocab_size,
-            device=device,
-            custom_params=custom_params,
-            custom_logit_processor=merged_custom_logit_processor,
-        )
-        # TODO (lianmin): `need_min_p_sampling` needs to be updated in filter and merge.
-
-        if enable_overlap_schedule:
-            # TODO (lianmin): Some penalizers such as frequency and presence depend on model outputs,
-            # so it is kind of tricky to make it work with overlap scheduler.
-            # It requires correcly updating the penalty logits before the sampling and syncing the events.
-            # We will support them later.
-            penalizers = {
-                penaltylib.BatchedMinNewTokensPenalizer,
-            }
-            if (
-                any(req.sampling_params.frequency_penalty != 0.0 for req in reqs)
-                or any(req.sampling_params.presence_penalty != 0.0 for req in reqs)
-                or any(req.sampling_params.repetition_penalty != 1.0 for req in reqs)
-            ):
-                logger.warning(
-                    "frequency_penalty, presence_penalty, and repetition_penalty are not supported "
-                    "when using the default overlap scheduler. They will be ignored. "
-                    "Please add `--disable-overlap` when launching the server if you need these features. "
-                    "The speed will be slower in that case."
-                )
-        else:
-            penalizers = {
-                penaltylib.BatchedFrequencyPenalizer,
-                penaltylib.BatchedMinNewTokensPenalizer,
-                penaltylib.BatchedPresencePenalizer,
-                penaltylib.BatchedRepetitionPenalizer,
-            }
-
         # Each penalizers will do nothing if they evaluate themselves as not required by looking at
         # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
         # should not add hefty computation overhead other than simple checks.
         #
-        # While we choose not to even create the class instances if they are not required, this
+        # While we can choose not to even create the class instances if they are not required, this
         # could add additional complexity to the {ScheduleBatch} class, especially we need to
         # handle {filter_batch()} and {merge_batch()} cases as well.
-
+        penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
             vocab_size=vocab_size,
             batch=batch,
-
-
+            penalizers={
+                penaltylib.BatchedFrequencyPenalizer,
+                penaltylib.BatchedMinNewTokensPenalizer,
+                penaltylib.BatchedPresencePenalizer,
+            },
         )

-
-
-
+        ret = cls(
+            temperatures=temperatures,
+            top_ps=top_ps,
+            top_ks=top_ks,
+            min_ps=min_ps,
+            is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
+            need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
+            vocab_size=vocab_size,
+            penalizer_orchestrator=penalizer_orchestrator,
+            has_custom_logit_processor=has_custom_logit_processor,
+            custom_params=custom_params,
+            custom_logit_processor=merged_custom_logit_processor,
+            device=device,
+        )
         return ret

     def __len__(self):
         return len(self.temperatures)

-    def update_penalties(self):
-        self.scaling_penalties = None
-        self.linear_penalties = None
-
-        for penalizer in self.penalizer_orchestrator.penalizers.values():
-            if not penalizer.is_prepared():
-                continue
-
-            if isinstance(penalizer, penaltylib.BatchedRepetitionPenalizer):
-                self.scaling_penalties = penalizer.cumulated_repetition_penalties
-            else:
-                if self.linear_penalties is None:
-                    bs = self.penalizer_orchestrator.batch.batch_size()
-                    self.linear_penalties = torch.zeros(
-                        (bs, self.vocab_size),
-                        dtype=torch.float32,
-                        device=self.device,
-                    )
-                self.linear_penalties = penalizer.apply(self.linear_penalties)
-
     def update_regex_vocab_mask(self):
         if not self.grammars:
             self.vocab_mask = None
-            self.
+            self.apply_mask_func = None
             return

-        #
+        # Find a grammar from the list
         first_grammar = next(grammar for grammar in self.grammars if grammar)

-        #
+        # TODO(lianmin): Maybe we can reuse the existing mask?
         self.vocab_mask = first_grammar.allocate_vocab_mask(
             vocab_size=self.vocab_size,
             batch_size=len(self.temperatures),
             device=self.device,
         )
-        self.
+        self.apply_mask_func = (
+            first_grammar.apply_vocab_mask
+        )  # force to use static method

         # Apply the mask
         for i, grammar in enumerate(self.grammars):
@@ -227,35 +174,56 @@ class SamplingBatchInfo:
         # Move the mask to the device if needed
         self.vocab_mask = first_grammar.move_vocab_mask(self.vocab_mask, self.device)

-    def
-        self.penalizer_orchestrator.
+    def update_penalties(self):
+        if self.penalizer_orchestrator.is_required:
+            self.linear_penalty = torch.zeros(
+                (len(self.temperatures), self.vocab_size),
+                dtype=torch.float32,
+                device=self.temperatures.device,
+            )
+            self.penalizer_orchestrator.apply(self.linear_penalty)
+        else:
+            self.linear_penalty = None
+
+    def apply_logits_bias(self, logits: torch.Tensor):
+        if self.linear_penalty is not None:
+            # Used in the overlap mode
+            logits.add_(self.linear_penalty)
+
+        if self.penalizer_orchestrator and self.penalizer_orchestrator.is_required:
+            # Used in the non-overlap mode
+            self.penalizer_orchestrator.apply(logits)
+
+        if self.vocab_mask is not None:
+            self.apply_mask_func(logits=logits, vocab_mask=self.vocab_mask)
+
+    def filter_batch(self, keep_indices: List[int], keep_indices_device: torch.Tensor):
+        self.penalizer_orchestrator.filter(keep_indices_device)
+
         if self.has_custom_logit_processor:
-            self._filter_batch_custom_logit_processor(
+            self._filter_batch_custom_logit_processor(keep_indices, keep_indices_device)

         for item in [
             "temperatures",
             "top_ps",
             "top_ks",
             "min_ps",
-            "logit_bias",
         ]:
             value = getattr(self, item, None)
-
-                setattr(self, item, value[new_indices])
+            setattr(self, item, value[keep_indices_device])

     def _filter_batch_custom_logit_processor(
-        self,
+        self, keep_indices: List[int], keep_indices_device: torch.Tensor
     ):
         """Filter the custom logit processor and custom params"""
-
         self.custom_logit_processor = {
-            k: (p, mask[
+            k: (p, mask[keep_indices_device])
             for k, (p, mask) in self.custom_logit_processor.items()
-            if any(
-                mask[
+            if torch.any(
+                mask[keep_indices_device]
             ) # ignore the custom logit processor whose mask is all False
         }
-        self.custom_params = [self.custom_params[i] for i in
+        self.custom_params = [self.custom_params[i] for i in keep_indices]

         # If the custom logit processor is an empty dict, set the flag to False,
         # and set the custom logit processor and custom params to None.
@@ -264,31 +232,6 @@ class SamplingBatchInfo:
         self.custom_params = None
         self.has_custom_logit_processor = False

-    @staticmethod
-    def merge_bias_tensor(
-        lhs: torch.Tensor,
-        rhs: torch.Tensor,
-        bs1: int,
-        bs2: int,
-        device: str,
-        default: int = 0,
-    ):
-        # bias tensor can be None
-        if lhs is not None or rhs is not None:
-            shape, dtype = None, None
-            if lhs is not None:
-                shape, dtype = lhs.shape[1:], lhs.dtype
-            else:
-                shape, dtype = rhs.shape[1:], rhs.dtype
-            with torch.dtype(dtype):
-                if lhs is None:
-                    lhs = torch.empty((bs1, *shape), device=device).fill_(default)
-                if rhs is None:
-                    rhs = torch.empty((bs2, *shape), device=device).fill_(default)
-            return torch.cat([lhs, rhs])
-
-        return None
-
     @staticmethod
     def merge_custom_logit_processor(
         lhs: Optional[Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]],
@@ -332,10 +275,6 @@ class SamplingBatchInfo:
     def merge_batch(self, other: "SamplingBatchInfo"):
         self.penalizer_orchestrator.merge(other.penalizer_orchestrator)

-        # Merge the logit bias tensor
-        self.logit_bias = SamplingBatchInfo.merge_bias_tensor(
-            self.logit_bias, other.logit_bias, len(self), len(other), self.device
-        )
         # Merge the custom logit processors and custom params lists
         if self.has_custom_logit_processor or other.has_custom_logit_processor:
             # Merge the custom logit processors
@@ -369,22 +308,5 @@ class SamplingBatchInfo:
             other_val = getattr(other, item, None)
             setattr(self, item, torch.concat([self_val, other_val]))

-        self.is_all_greedy
-        self.need_min_p_sampling
-
-    def apply_logits_bias(self, logits: torch.Tensor):
-        # Apply logit_bias
-        if self.logit_bias is not None:
-            logits.add_(self.logit_bias)
-
-        # min-token, presence, frequency
-        if self.linear_penalties is not None:
-            logits.add_(self.linear_penalties)
-
-        # repetition
-        if self.scaling_penalties is not None:
-            apply_scaling_penalties(logits, self.scaling_penalties)
-
-        # Apply regex vocab_mask
-        if self.vocab_mask is not None:
-            self.apply_mask(logits=logits, vocab_mask=self.vocab_mask)
+        self.is_all_greedy |= other.is_all_greedy
+        self.need_min_p_sampling |= other.need_min_p_sampling
```
```diff
--- a/sglang/srt/sampling/sampling_params.py
+++ b/sglang/srt/sampling/sampling_params.py
@@ -22,8 +22,8 @@ class SamplingParams:
     """
     The sampling parameters.

-    See docs/
-    https://docs.sglang.ai/
+    See docs/backend/sampling_params.md or
+    https://docs.sglang.ai/backend/sampling_params.html
     for the documentation.
     """

@@ -40,16 +40,23 @@ class SamplingParams:
         presence_penalty: float = 0.0,
         repetition_penalty: float = 1.0,
         min_new_tokens: int = 0,
-        spaces_between_special_tokens: bool = True,
         n: int = 1,
         json_schema: Optional[str] = None,
         regex: Optional[str] = None,
         ebnf: Optional[str] = None,
-
+        structural_tag: Optional[str] = None,
         ignore_eos: bool = False,
         skip_special_tokens: bool = True,
+        spaces_between_special_tokens: bool = True,
+        no_stop_trim: bool = False,
         custom_params: Optional[Dict[str, Any]] = None,
     ) -> None:
+        self.max_new_tokens = max_new_tokens
+        self.stop_strs = stop
+        if stop_token_ids:
+            self.stop_token_ids = set(stop_token_ids)
+        else:
+            self.stop_token_ids = None
         self.temperature = temperature
         self.top_p = top_p
         self.top_k = top_k
@@ -57,25 +64,21 @@ class SamplingParams:
         self.frequency_penalty = frequency_penalty
         self.presence_penalty = presence_penalty
         self.repetition_penalty = repetition_penalty
-        self.stop_strs = stop
-        if stop_token_ids:
-            self.stop_token_ids = set(stop_token_ids)
-        else:
-            self.stop_token_ids = None
-        self.max_new_tokens = max_new_tokens
         self.min_new_tokens = min_new_tokens
-        self.ignore_eos = ignore_eos
-        self.skip_special_tokens = skip_special_tokens
-        self.spaces_between_special_tokens = spaces_between_special_tokens
         self.regex = regex
         self.n = n
         self.json_schema = json_schema
         self.ebnf = ebnf
+        self.structural_tag = structural_tag
+        self.ignore_eos = ignore_eos
+        self.skip_special_tokens = skip_special_tokens
+        self.spaces_between_special_tokens = spaces_between_special_tokens
         self.no_stop_trim = no_stop_trim
         self.custom_params = custom_params

         # Process some special cases
         if self.temperature < _SAMPLING_EPS:
+            # top_k = 1 means greedy sampling
             self.temperature = 1.0
             self.top_k = 1
         if self.top_k == -1:
```