sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +73 -14
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/launch_server.py +2 -0
  5. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  6. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +221 -4
  7. sglang/srt/checkpoint_engine/__init__.py +9 -0
  8. sglang/srt/checkpoint_engine/update.py +317 -0
  9. sglang/srt/compilation/backend.py +1 -1
  10. sglang/srt/configs/__init__.py +2 -0
  11. sglang/srt/configs/deepseek_ocr.py +542 -10
  12. sglang/srt/configs/deepseekvl2.py +95 -194
  13. sglang/srt/configs/kimi_linear.py +160 -0
  14. sglang/srt/configs/mamba_utils.py +66 -0
  15. sglang/srt/configs/model_config.py +30 -7
  16. sglang/srt/constants.py +7 -0
  17. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  18. sglang/srt/disaggregation/decode.py +34 -6
  19. sglang/srt/disaggregation/nixl/conn.py +2 -2
  20. sglang/srt/disaggregation/prefill.py +25 -3
  21. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  22. sglang/srt/distributed/parallel_state.py +9 -12
  23. sglang/srt/entrypoints/engine.py +31 -20
  24. sglang/srt/entrypoints/grpc_server.py +0 -1
  25. sglang/srt/entrypoints/http_server.py +94 -94
  26. sglang/srt/entrypoints/openai/protocol.py +7 -1
  27. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  28. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  29. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  30. sglang/srt/environ.py +23 -2
  31. sglang/srt/eplb/expert_distribution.py +64 -1
  32. sglang/srt/eplb/expert_location.py +106 -36
  33. sglang/srt/function_call/function_call_parser.py +2 -0
  34. sglang/srt/function_call/minimax_m2.py +367 -0
  35. sglang/srt/grpc/compile_proto.py +3 -0
  36. sglang/srt/layers/activation.py +6 -0
  37. sglang/srt/layers/attention/ascend_backend.py +233 -5
  38. sglang/srt/layers/attention/attention_registry.py +3 -0
  39. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  40. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  41. sglang/srt/layers/attention/fla/kda.py +1359 -0
  42. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  43. sglang/srt/layers/attention/flashattention_backend.py +19 -8
  44. sglang/srt/layers/attention/flashinfer_backend.py +10 -1
  45. sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -11
  46. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  47. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  48. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  49. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  50. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  51. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  52. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  53. sglang/srt/layers/attention/nsa_backend.py +157 -23
  54. sglang/srt/layers/attention/triton_backend.py +4 -1
  55. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  56. sglang/srt/layers/attention/trtllm_mla_backend.py +11 -15
  57. sglang/srt/layers/attention/utils.py +78 -0
  58. sglang/srt/layers/communicator.py +24 -1
  59. sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
  60. sglang/srt/layers/layernorm.py +35 -6
  61. sglang/srt/layers/logits_processor.py +9 -20
  62. sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
  63. sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
  64. sglang/srt/layers/moe/ep_moe/layer.py +78 -289
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  67. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  68. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  69. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  70. sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
  71. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
  72. sglang/srt/layers/moe/moe_runner/deep_gemm.py +340 -55
  73. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  74. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  75. sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
  76. sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
  77. sglang/srt/layers/moe/token_dispatcher/deepep.py +25 -18
  78. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  79. sglang/srt/layers/moe/topk.py +35 -10
  80. sglang/srt/layers/moe/utils.py +3 -4
  81. sglang/srt/layers/pooler.py +21 -2
  82. sglang/srt/layers/quantization/__init__.py +13 -84
  83. sglang/srt/layers/quantization/auto_round.py +394 -0
  84. sglang/srt/layers/quantization/awq.py +0 -3
  85. sglang/srt/layers/quantization/base_config.py +7 -0
  86. sglang/srt/layers/quantization/fp8.py +68 -63
  87. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  88. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  89. sglang/srt/layers/quantization/gguf.py +566 -0
  90. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  91. sglang/srt/layers/quantization/mxfp4.py +30 -38
  92. sglang/srt/layers/quantization/unquant.py +23 -45
  93. sglang/srt/layers/quantization/w4afp8.py +38 -2
  94. sglang/srt/layers/radix_attention.py +5 -2
  95. sglang/srt/layers/rotary_embedding.py +130 -46
  96. sglang/srt/layers/sampler.py +12 -1
  97. sglang/srt/lora/lora_registry.py +9 -0
  98. sglang/srt/managers/async_mm_data_processor.py +122 -0
  99. sglang/srt/managers/data_parallel_controller.py +30 -3
  100. sglang/srt/managers/detokenizer_manager.py +3 -0
  101. sglang/srt/managers/io_struct.py +29 -4
  102. sglang/srt/managers/multi_tokenizer_mixin.py +22 -1
  103. sglang/srt/managers/schedule_batch.py +74 -15
  104. sglang/srt/managers/scheduler.py +185 -144
  105. sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
  106. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  107. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  108. sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
  109. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  110. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  111. sglang/srt/managers/session_controller.py +6 -5
  112. sglang/srt/managers/tokenizer_manager.py +165 -78
  113. sglang/srt/managers/tp_worker.py +24 -1
  114. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  115. sglang/srt/mem_cache/common.py +1 -0
  116. sglang/srt/mem_cache/hicache_storage.py +7 -1
  117. sglang/srt/mem_cache/memory_pool.py +253 -57
  118. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  119. sglang/srt/mem_cache/radix_cache.py +4 -0
  120. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  121. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  122. sglang/srt/metrics/collector.py +46 -3
  123. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  124. sglang/srt/model_executor/forward_batch_info.py +55 -14
  125. sglang/srt/model_executor/model_runner.py +77 -170
  126. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  127. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
  128. sglang/srt/model_loader/weight_utils.py +1 -1
  129. sglang/srt/models/bailing_moe.py +9 -2
  130. sglang/srt/models/deepseek_nextn.py +11 -2
  131. sglang/srt/models/deepseek_v2.py +296 -78
  132. sglang/srt/models/glm4.py +391 -77
  133. sglang/srt/models/glm4_moe.py +322 -354
  134. sglang/srt/models/glm4_moe_nextn.py +4 -14
  135. sglang/srt/models/glm4v.py +196 -55
  136. sglang/srt/models/glm4v_moe.py +29 -197
  137. sglang/srt/models/gpt_oss.py +1 -10
  138. sglang/srt/models/kimi_linear.py +678 -0
  139. sglang/srt/models/llama4.py +1 -1
  140. sglang/srt/models/llama_eagle3.py +11 -1
  141. sglang/srt/models/longcat_flash.py +2 -2
  142. sglang/srt/models/minimax_m2.py +922 -0
  143. sglang/srt/models/nvila.py +355 -0
  144. sglang/srt/models/nvila_lite.py +184 -0
  145. sglang/srt/models/qwen2.py +23 -2
  146. sglang/srt/models/qwen2_moe.py +30 -15
  147. sglang/srt/models/qwen3.py +35 -5
  148. sglang/srt/models/qwen3_moe.py +18 -12
  149. sglang/srt/models/qwen3_next.py +7 -0
  150. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  151. sglang/srt/multimodal/processors/base_processor.py +1 -0
  152. sglang/srt/multimodal/processors/glm4v.py +1 -1
  153. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  154. sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
  155. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  156. sglang/srt/multiplex/pdmux_context.py +164 -0
  157. sglang/srt/parser/conversation.py +7 -1
  158. sglang/srt/parser/reasoning_parser.py +28 -1
  159. sglang/srt/sampling/custom_logit_processor.py +67 -1
  160. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  161. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  162. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  163. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  164. sglang/srt/server_args.py +459 -199
  165. sglang/srt/single_batch_overlap.py +2 -4
  166. sglang/srt/speculative/draft_utils.py +16 -0
  167. sglang/srt/speculative/eagle_info.py +42 -36
  168. sglang/srt/speculative/eagle_info_v2.py +68 -25
  169. sglang/srt/speculative/eagle_utils.py +261 -16
  170. sglang/srt/speculative/eagle_worker.py +11 -3
  171. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  172. sglang/srt/speculative/spec_info.py +305 -31
  173. sglang/srt/speculative/spec_utils.py +44 -8
  174. sglang/srt/tracing/trace.py +121 -12
  175. sglang/srt/utils/common.py +142 -74
  176. sglang/srt/utils/hf_transformers_utils.py +38 -12
  177. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  178. sglang/test/kits/radix_cache_server_kit.py +50 -0
  179. sglang/test/runners.py +31 -7
  180. sglang/test/simple_eval_common.py +5 -3
  181. sglang/test/simple_eval_humaneval.py +1 -0
  182. sglang/test/simple_eval_math.py +1 -0
  183. sglang/test/simple_eval_mmlu.py +1 -0
  184. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  185. sglang/test/test_deterministic.py +235 -12
  186. sglang/test/test_deterministic_utils.py +2 -1
  187. sglang/test/test_utils.py +7 -1
  188. sglang/version.py +1 -1
  189. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +15 -28
  190. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +194 -175
  191. sglang/srt/models/vila.py +0 -306
  192. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  193. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  194. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  195. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/multiplex/pdmux_context.py
@@ -0,0 +1,164 @@
+from dataclasses import dataclass, field
+from typing import List
+
+import torch
+import yaml
+
+STREAM_GROUPS = []
+SM_COUNTS = []
+SM_GROUP_NUM = 8  # Default number of SM groups
+CURRENT_STREAM_IDX = 0
+CURRENT_STREAM_GROUP = None
+
+
+@dataclass
+class PDMuxConfig:
+    sm_group_num: int = 8
+    manual_divisions: List[List[int]] = field(
+        default_factory=list
+    )  # [prefill_sm, decode_sm, decode_bs_threshold]
+    split_forward_token_budget: int = 65536
+    decode_bs_divisor: int = 36
+
+
+def load_pdmux_config(config_path: str) -> PDMuxConfig:
+    """Load pdmux configuration from YAML file into a dataclass."""
+    if not config_path:
+        return PDMuxConfig()
+
+    with open(config_path, "r") as f:
+        raw = yaml.safe_load(f)
+
+    if "sm_group_num" not in raw:
+        raise ValueError("Missing required field: sm_group_num")
+
+    if raw["sm_group_num"] < 3:
+        raise ValueError("sm_group_num must be at least 3")
+
+    manual_divisions = raw.get("manual_divisions", [])
+
+    expected = raw["sm_group_num"] - 2
+    if manual_divisions and len(manual_divisions) != expected:
+        raise ValueError(
+            f"manual_divisions must have {expected} entries, "
+            f"but got {len(manual_divisions)}"
+        )
+
+    return PDMuxConfig(
+        sm_group_num=raw["sm_group_num"],
+        manual_divisions=manual_divisions,
+        split_forward_token_budget=raw.get("split_forward_token_budget", 65536),
+        decode_bs_divisor=raw.get("decode_bs_divisor", 36),
+    )
+
+
+def get_arch_constraints(compute_capability):
+    major, minor = compute_capability
+    # green context constraints for different architectures
+    if major == 6:
+        return 1, 1  # min_per_part, multiple
+    elif major == 7:
+        return 2, 2
+    elif major == 8:
+        return 4, 2
+    elif major == 9 and minor >= 0:
+        return 8, 8
+    else:
+        raise ValueError(f"Unsupported compute capability: {major}.{minor}")
+
+
+def divide_sm(total_sms, compute_capability, groups):
+    """
+    :param total_sms: total SM count on a single GPU
+    :param compute_capability: (major, minor)
+    :return: SM partition groups (prefill_sm, decode_sm)
+    """
+    min_per_part, multiple = get_arch_constraints(compute_capability)
+    possible_values = [
+        x
+        for x in range(min_per_part, total_sms - min_per_part + 1, multiple)
+        if x >= total_sms - x and total_sms - x >= 16
+    ]
+    if not possible_values:
+        raise ValueError(
+            f"No valid partitions found for total SMs {total_sms} "
+            f"with constraints (min per part: {min_per_part}, multiple: {multiple})"
+        )
+
+    if len(possible_values) >= groups:
+        step = max(1, len(possible_values) // groups)
+        selected_values = possible_values[::step][:groups]
+    else:
+        selected_values = possible_values
+
+    divisions = []
+    for part1 in selected_values:
+        part2 = total_sms - part1
+        divisions.append((part1, part2))
+
+    divisions.reverse()  # Reverse to have larger prefill SM first
+
+    return divisions
+
+
+def initialize_stream_groups(gpu_id: int, config: PDMuxConfig):
+    from sgl_kernel import spatial
+
+    global STREAM_GROUPS, SM_COUNTS, SM_GROUP_NUM, CURRENT_STREAM_IDX, CURRENT_STREAM_GROUP
+    # for pd_multiplexing, Init stream_groups
+    device = torch.cuda.current_device()
+    total_sm_count = spatial.get_sm_available(gpu_id)
+    # (prefill_sm_count, decode_sm_count)
+    if config.manual_divisions:
+        divisions = [
+            (prefill_sm, decode_sm)
+            for prefill_sm, decode_sm, _ in config.manual_divisions
+        ]
+    else:
+        divisions = divide_sm(
+            total_sm_count,
+            torch.cuda.get_device_capability(device),
+            config.sm_group_num - 2,
+        )
+
+    SM_COUNTS = []
+    SM_COUNTS.append((total_sm_count, 0))  # Normal stream for prefill
+    SM_COUNTS.extend(divisions)  # Add the divided SM counts
+    SM_COUNTS.append((0, total_sm_count))  # Normal stream for decode
+    STREAM_GROUPS = []
+    STREAM_GROUPS.append(
+        (torch.cuda.Stream(gpu_id), torch.cuda.Stream(gpu_id))
+    )  # Normal stream for prefill
+    for prefill_sm, decode_sm in divisions:
+        STREAM_GROUPS.append(
+            (spatial.create_greenctx_stream_by_value(prefill_sm, decode_sm, gpu_id))
+        )
+    STREAM_GROUPS.append(
+        (torch.cuda.Stream(gpu_id), torch.cuda.Stream(gpu_id))
+    )  # Normal stream for decode
+
+    CURRENT_STREAM_IDX = 0
+    CURRENT_STREAM_GROUP = STREAM_GROUPS[CURRENT_STREAM_IDX]
+
+
+def set_current_stream_idx(idx: int):
+    global CURRENT_STREAM_IDX, CURRENT_STREAM_GROUP
+    if idx < 0 or idx >= len(STREAM_GROUPS):
+        raise ValueError(f"Invalid stream index: {idx}")
+    CURRENT_STREAM_IDX = idx
+    CURRENT_STREAM_GROUP = STREAM_GROUPS[CURRENT_STREAM_IDX]
+
+
+def get_stream_groups() -> list[tuple[torch.cuda.Stream, torch.cuda.Stream]]:
+    """Get the stream groups."""
+    return STREAM_GROUPS
+
+
+def get_sm_counts() -> list[tuple[int, int]]:
+    """Get the SM counts."""
+    return SM_COUNTS
+
+
+def get_current_stream_idx() -> int:
+    """Get the current stream index."""
+    return CURRENT_STREAM_IDX
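For reference, a minimal sketch of how a pdmux config file maps onto load_pdmux_config above. The keys mirror PDMuxConfig; the numeric values, the temporary file path, and the choice of sm_group_num are purely illustrative. Note the constraints enforced by the loader: sm_group_num must be at least 3, and manual_divisions, when supplied, needs sm_group_num - 2 entries of [prefill_sm, decode_sm, decode_bs_threshold].

# Hypothetical usage sketch (not part of the diff): build an illustrative pdmux
# config and round-trip it through load_pdmux_config from the new module.
import yaml

from sglang.srt.multiplex.pdmux_context import load_pdmux_config

example_cfg = {
    "sm_group_num": 4,                # >= 3; two groups are the pure prefill/decode streams
    "manual_divisions": [             # optional; needs sm_group_num - 2 entries
        [100, 32, 64],                # [prefill_sm, decode_sm, decode_bs_threshold], illustrative
        [84, 48, 128],
    ],
    "split_forward_token_budget": 65536,
    "decode_bs_divisor": 36,
}

with open("/tmp/pdmux_config.yaml", "w") as f:
    yaml.safe_dump(example_cfg, f)

config = load_pdmux_config("/tmp/pdmux_config.yaml")
print(config.sm_group_num, config.manual_divisions)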
sglang/srt/parser/conversation.py
@@ -101,6 +101,7 @@ class Conversation:
     stop_token_ids: Optional[int] = None

     audio_data: Optional[List[str]] = None
+    image_token_at_prefix: bool = False

     def get_prompt(self) -> str:
         """Get the prompt for generation."""
@@ -445,6 +446,7 @@ class Conversation:
             image_token=self.image_token,
             video_token=self.video_token,
             audio_token=self.audio_token,
+            image_token_at_prefix=self.image_token_at_prefix,
         )

     def dict(self):
@@ -512,6 +514,7 @@ def generate_embedding_convs(
         image_token=conv_template.image_token,
         video_token=conv_template.video_token,
         audio_token=conv_template.audio_token,
+        image_token_at_prefix=conv_template.image_token_at_prefix,
     )
     real_content = ""

@@ -578,6 +581,7 @@ def generate_chat_conv(
         image_token=conv.image_token,
         audio_token=conv.audio_token,
         video_token=conv.video_token,
+        image_token_at_prefix=conv.image_token_at_prefix,
     )

     if isinstance(request.messages, str):
@@ -627,7 +631,7 @@ def generate_chat_conv(
                     real_content += content.text
                 elif content.type == "image_url":
                     # NOTE: works for llava and intervl2_5
-                    if conv.name in ["internvl-2-5"]:
+                    if conv.image_token_at_prefix:
                         real_content = image_token + real_content
                     else:
                         real_content += image_token
@@ -820,6 +824,7 @@ register_conv_template(
         sep="<|im_end|>\n",
         stop_str=["<|im_end|>", "<|action_end|>"],
         image_token="<IMG_CONTEXT>",
+        image_token_at_prefix=True,
    )
)

@@ -848,6 +853,7 @@ register_conv_template(
         sep_style=SeparatorStyle.NO_COLON_SINGLE,
         stop_str=["<|end▁of▁sentence|>"],
         image_token="<image>",
+        image_token_at_prefix=True,
    )
)

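The behavioral change above is small but easy to miss: templates that set image_token_at_prefix=True (previously hard-coded to the "internvl-2-5" template name) place the image placeholder before the accumulated text instead of appending it after. A toy sketch of the two branches, using made-up strings rather than the real Conversation machinery:

# Toy illustration (not part of the diff) of the image_token_at_prefix branch in
# generate_chat_conv; only the ordering logic mirrors the code above.
def place_image_token(real_content: str, image_token: str, image_token_at_prefix: bool) -> str:
    if image_token_at_prefix:
        return image_token + real_content   # prefix placement, as the InternVL-style templates expect
    return real_content + image_token       # default: append after the text

print(place_image_token("Describe this picture.", "<IMG_CONTEXT>", True))
# <IMG_CONTEXT>Describe this picture.
print(place_image_token("Describe this picture.", "<image>", False))
# Describe this picture.<image>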
sglang/srt/parser/reasoning_parser.py
@@ -249,6 +249,31 @@ class GptOssDetector(BaseReasoningFormatDetector):
         )


+class MiniMaxAppendThinkDetector(BaseReasoningFormatDetector):
+    """
+    Append `<think>` token to the beginning of the text.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
+        # scheduler.py needs `reasoning_parser.detector.think_end_token`
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=force_reasoning,
+            stream_reasoning=stream_reasoning,
+        )
+        self.is_first_chunk = False
+
+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        if not self.is_first_chunk:
+            self.is_first_chunk = True
+            new_text = self.think_start_token + new_text
+        return StreamingParseResult(normal_text=new_text)
+
+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        return StreamingParseResult(normal_text=self.think_start_token + text)
+
+
 class ReasoningParser:
     """
     Parser that handles both streaming and non-streaming scenarios for extracting
@@ -268,6 +293,8 @@ class ReasoningParser:
         "kimi": KimiDetector,
         "qwen3": Qwen3Detector,
         "qwen3-thinking": Qwen3Detector,
+        "minimax": Qwen3Detector,
+        "minimax-append-think": MiniMaxAppendThinkDetector,
         "step3": DeepSeekR1Detector,
     }

@@ -285,7 +312,7 @@ class ReasoningParser:
             raise ValueError(f"Unsupported model type: {model_type}")

         # Special cases where we override force_reasoning
-        if model_type.lower() in {"qwen3-thinking", "gpt-oss"}:
+        if model_type.lower() in {"qwen3-thinking", "gpt-oss", "minimax"}:
             force_reasoning = True

         # Only pass force_reasoning if explicitly set, let detectors use their defaults
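Two MiniMax entries are registered: "minimax" reuses Qwen3Detector with force_reasoning enabled, while "minimax-append-think" uses the new detector, which does not split reasoning from the answer itself; it only re-inserts the leading <think> marker so downstream consumers see a well-formed reasoning block. A rough sketch of that behavior; instantiating the detector directly like this bypasses ReasoningParser and is simplified for illustration.

# Not part of the diff: illustrative behavior of the "minimax-append-think" detector.
from sglang.srt.parser.reasoning_parser import MiniMaxAppendThinkDetector

detector = MiniMaxAppendThinkDetector()

# Non-streaming: the whole text gets a leading <think> marker.
result = detector.detect_and_parse("step 1 ... </think> final answer")
print(result.normal_text)  # "<think>step 1 ... </think> final answer"

# Streaming: only the first chunk is prefixed.
print(detector.parse_streaming_increment("step 1 ").normal_text)  # "<think>step 1 "
print(detector.parse_streaming_increment("step 2").normal_text)   # "step 2"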
sglang/srt/sampling/custom_logit_processor.py
@@ -1,7 +1,7 @@
 import json
 from abc import ABC, abstractmethod
 from functools import lru_cache
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set

 import dill
 import orjson
@@ -126,3 +126,69 @@ class DeepSeekR1ThinkingBudgetLogitProcessor(ThinkingBudgetLogitProcessor):
     THINKING_START_TOKEN_ID: int = 128798
     THINKING_END_TOKEN_ID: int = 128799
     NEW_LINE_TOKEN_ID: int = 201
+
+
+# Adapted from DeepSeek's implementation: https://github.com/deepseek-ai/DeepSeek-OCR/blob/main/DeepSeek-OCR-master/DeepSeek-OCR-vllm/process/ngram_norepeat.py
+class DeepseekOCRNoRepeatNGramLogitProcessor(CustomLogitProcessor):
+    """Block n-gram repetitions within a sliding window for DeepSeek-OCR outputs."""
+
+    def __call__(
+        self,
+        logits: torch.Tensor,
+        custom_param_list: Optional[List[Dict[str, Any]]] = None,
+    ) -> torch.Tensor:
+        if not custom_param_list:
+            return logits
+
+        for batch_idx, params in enumerate(custom_param_list):
+            if not params:
+                continue
+
+            req = params.get("__req__")
+            if req is None:
+                continue
+
+            try:
+                ngram_size = int(params.get("ngram_size") or 0)
+                window_size = int(params.get("window_size") or 0)
+            except (TypeError, ValueError):
+                continue
+
+            if ngram_size <= 0 or window_size <= 0:
+                continue
+
+            sequence: List[int] = req.origin_input_ids + req.output_ids
+            if len(sequence) < ngram_size:
+                continue
+
+            search_start = max(0, len(sequence) - window_size)
+            search_end = len(sequence) - ngram_size + 1
+            if search_end <= search_start:
+                continue
+
+            if ngram_size > 1:
+                current_prefix = tuple(sequence[-(ngram_size - 1) :])
+            else:
+                current_prefix = tuple()
+
+            banned_tokens: Set[int] = set()
+            for idx in range(search_start, search_end):
+                ngram = sequence[idx : idx + ngram_size]
+                if ngram_size == 1 or tuple(ngram[:-1]) == current_prefix:
+                    banned_tokens.add(ngram[-1])
+
+            whitelist_ids = params.get("whitelist_token_ids") or []
+            try:
+                whitelist = {int(token_id) for token_id in whitelist_ids}
+            except (TypeError, ValueError):
+                whitelist = set()
+
+            banned_tokens.difference_update(whitelist)
+
+            if not banned_tokens:
+                continue
+
+            indices = list(banned_tokens)
+            logits[batch_idx, indices] = -float("inf")
+
+        return logits
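The core rule the processor applies is: a token is banned if emitting it would complete an n-gram that already occurred inside the trailing window. A standalone rewrite of just that rule on a plain token list (not the shipped class, no tensors or request objects), to make the indexing easier to follow:

# Standalone sketch (not part of the diff) of the n-gram banning rule used above.
from typing import List, Set


def banned_next_tokens(sequence: List[int], ngram_size: int, window_size: int) -> Set[int]:
    if ngram_size <= 0 or window_size <= 0 or len(sequence) < ngram_size:
        return set()
    search_start = max(0, len(sequence) - window_size)
    search_end = len(sequence) - ngram_size + 1
    if search_end <= search_start:
        return set()
    # The (ngram_size - 1) most recent tokens; the banned set is every token that
    # followed this prefix anywhere in the window.
    prefix = tuple(sequence[-(ngram_size - 1):]) if ngram_size > 1 else tuple()
    banned: Set[int] = set()
    for idx in range(search_start, search_end):
        ngram = sequence[idx: idx + ngram_size]
        if ngram_size == 1 or tuple(ngram[:-1]) == prefix:
            banned.add(ngram[-1])
    return banned


# The sequence ends in (7, 8); the trigram (7, 8, 9) already occurred, so 9 is banned next.
print(banned_next_tokens([7, 8, 9, 1, 7, 8], ngram_size=3, window_size=6))  # {9}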
sglang/srt/sampling/penaltylib/frequency_penalty.py
@@ -1,9 +1,6 @@
 import torch

-from sglang.srt.sampling.penaltylib.orchestrator import (
-    BatchedPenalizerOrchestrator,
-    _BatchedPenalizer,
-)
+from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer


 class BatchedFrequencyPenalizer(_BatchedPenalizer):
@@ -11,10 +8,6 @@ class BatchedFrequencyPenalizer(_BatchedPenalizer):
     Frequency penalizer penalizes tokens based on their frequency in the output.
     """

-    def __init__(self, orchestrator: BatchedPenalizerOrchestrator):
-        self.orchestrator = orchestrator
-        self._is_prepared = False
-
     def _is_required(self) -> bool:
         return any(
             req.sampling_params.frequency_penalty != 0.0
@@ -63,3 +56,8 @@ class BatchedFrequencyPenalizer(_BatchedPenalizer):
             [self.cumulated_frequency_penalties, their.cumulated_frequency_penalties],
             dim=0,
         )
+
+    def _teardown(self) -> None:
+        for name in ("frequency_penalties", "cumulated_frequency_penalties"):
+            if hasattr(self, name):
+                delattr(self, name)
sglang/srt/sampling/penaltylib/min_new_tokens.py
@@ -1,9 +1,6 @@
 import torch

-from sglang.srt.sampling.penaltylib.orchestrator import (
-    BatchedPenalizerOrchestrator,
-    _BatchedPenalizer,
-)
+from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer


 class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
@@ -11,10 +8,6 @@ class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
     Min new tokens penalizer penalizes tokens based on the length of the output.
     """

-    def __init__(self, orchestrator: BatchedPenalizerOrchestrator):
-        self.orchestrator = orchestrator
-        self._is_prepared = False
-
     def _is_required(self) -> bool:
         return any(
             req.sampling_params.min_new_tokens > 0 for req in self.orchestrator.reqs()
@@ -92,3 +85,9 @@ class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
         self.len_output_tokens = torch.cat(
             [self.len_output_tokens, their.len_output_tokens], dim=0
         )
+
+    # Explicit resource cleanup to aid GC and free CUDA memory promptly
+    def _teardown(self) -> None:
+        for name in ("min_new_tokens", "stop_token_penalties", "len_output_tokens"):
+            if hasattr(self, name):
+                delattr(self, name)
sglang/srt/sampling/penaltylib/orchestrator.py
@@ -77,9 +77,8 @@ class BatchedPenalizerOrchestrator:
             return

         if len(keep_indices) == 0:
-            self.is_required = False
-            for penalizer in self.penalizers.values():
-                penalizer.teardown()
+            # No requests left in the batch, fully release orchestrator resources
+            self.release()
             return

         is_required = False
@@ -92,6 +91,23 @@ class BatchedPenalizerOrchestrator:
                 penalizer.teardown()
         self.is_required = is_required

+    # Resource management helpers
+    def release(self) -> None:
+        """Release all penalizers and break references so GC can reclaim promptly."""
+        for penalizer in self.penalizers.values():
+            penalizer.teardown()
+        self.penalizers.clear()
+        # Break reference to ScheduleBatch
+        self._batch_ref = None
+        self.is_required = False
+
+    # Context manager support
+    def __enter__(self) -> "BatchedPenalizerOrchestrator":
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        self.release()
+

     def merge(self, their: "BatchedPenalizerOrchestrator"):
         """
@@ -116,6 +132,22 @@ class _BatchedPenalizer(abc.ABC):
     An abstract class for a batched penalizer.
     """

+    def __init__(self, orchestrator: BatchedPenalizerOrchestrator):
+        self._orchestrator_ref: weakref.ReferenceType[BatchedPenalizerOrchestrator] = (
+            weakref.ref(orchestrator)
+        )
+        self._is_prepared = False
+
+    @property
+    def orchestrator(self) -> BatchedPenalizerOrchestrator:
+        orch: Optional[BatchedPenalizerOrchestrator] = self._orchestrator_ref()
+        # This should never happen, but we need to handle it gracefully
+        if orch is None:
+            raise RuntimeError(
+                "BatchedPenalizerOrchestrator has been garbage-collected"
+            )
+        return orch
+
     def is_prepared(self) -> bool:
         return self._is_prepared

@@ -135,6 +167,7 @@ class _BatchedPenalizer(abc.ABC):
         return False

     def teardown(self):
+        self._teardown()
         self._is_prepared = False

     def cumulate_output_tokens(self, output_ids: torch.Tensor):
@@ -207,3 +240,10 @@ class _BatchedPenalizer(abc.ABC):
         Merge the penalizer with another penalizer.
         """
         pass
+
+    @abc.abstractmethod
+    def _teardown(self):
+        """
+        Teardown the penalizer.
+        """
+        pass
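The structural change here is that each penalizer now keeps only a weak reference back to its orchestrator, while the orchestrator keeps the strong references; dropping the orchestrator therefore frees the whole group (and its tensors) by plain reference counting instead of waiting on the cycle collector. A minimal sketch of that ownership pattern with generic classes (not the real penalizer types):

# Minimal sketch (not the real classes) of the parent-owns-children, child-holds-weakref pattern.
import weakref


class Parent:
    def __init__(self):
        self.children = [Child(self) for _ in range(2)]   # strong references downward

    def release(self):
        self.children.clear()


class Child:
    def __init__(self, parent: "Parent"):
        self._parent_ref = weakref.ref(parent)            # no strong back-reference

    @property
    def parent(self) -> "Parent":
        parent = self._parent_ref()
        if parent is None:
            raise RuntimeError("parent has been garbage-collected")
        return parent


p = Parent()
child = p.children[0]
del p                    # parent is reclaimed immediately; no reference cycle keeps it alive
try:
    child.parent
except RuntimeError as e:
    print(e)             # parent has been garbage-collected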
sglang/srt/sampling/penaltylib/presence_penalty.py
@@ -1,9 +1,6 @@
 import torch

-from sglang.srt.sampling.penaltylib.orchestrator import (
-    BatchedPenalizerOrchestrator,
-    _BatchedPenalizer,
-)
+from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer


 class BatchedPresencePenalizer(_BatchedPenalizer):
@@ -11,10 +8,6 @@ class BatchedPresencePenalizer(_BatchedPenalizer):
     Presence penalizer penalizes tokens based on their presence in the output.
     """

-    def __init__(self, orchestrator: BatchedPenalizerOrchestrator):
-        self.orchestrator = orchestrator
-        self._is_prepared = False
-
     def _is_required(self) -> bool:
         return any(
             req.sampling_params.presence_penalty != 0.0
@@ -63,3 +56,8 @@ class BatchedPresencePenalizer(_BatchedPenalizer):
             [self.cumulated_presence_penalties, their.cumulated_presence_penalties],
             dim=0,
         )
+
+    def _teardown(self) -> None:
+        for name in ("presence_penalties", "cumulated_presence_penalties"):
+            if hasattr(self, name):
+                delattr(self, name)