sglang 0.4.1.post6__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +21 -23
- sglang/api.py +2 -7
- sglang/bench_offline_throughput.py +41 -27
- sglang/bench_one_batch.py +60 -4
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +83 -71
- sglang/lang/backend/runtime_endpoint.py +183 -4
- sglang/lang/chat_template.py +46 -4
- sglang/launch_server.py +1 -1
- sglang/srt/_custom_ops.py +80 -42
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +21 -0
- sglang/srt/constrained/xgrammar_backend.py +8 -4
- sglang/srt/conversation.py +14 -1
- sglang/srt/distributed/__init__.py +3 -3
- sglang/srt/distributed/communication_op.py +2 -1
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +112 -42
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
- sglang/srt/distributed/device_communicators/pynccl.py +80 -1
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
- sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
- sglang/srt/distributed/parallel_state.py +1 -1
- sglang/srt/distributed/utils.py +2 -1
- sglang/srt/entrypoints/engine.py +452 -0
- sglang/srt/entrypoints/http_server.py +603 -0
- sglang/srt/function_call_parser.py +494 -0
- sglang/srt/layers/activation.py +8 -8
- sglang/srt/layers/attention/flashinfer_backend.py +10 -9
- sglang/srt/layers/attention/triton_backend.py +4 -6
- sglang/srt/layers/attention/vision.py +204 -0
- sglang/srt/layers/dp_attention.py +71 -0
- sglang/srt/layers/layernorm.py +5 -5
- sglang/srt/layers/linear.py +65 -14
- sglang/srt/layers/logits_processor.py +49 -64
- sglang/srt/layers/moe/ep_moe/layer.py +24 -16
- sglang/srt/layers/moe/fused_moe_native.py +84 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -7
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -5
- sglang/srt/layers/parameter.py +18 -8
- sglang/srt/layers/quantization/__init__.py +20 -23
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/fp8.py +10 -4
- sglang/srt/layers/quantization/modelopt_quant.py +1 -2
- sglang/srt/layers/quantization/w8a8_int8.py +1 -1
- sglang/srt/layers/radix_attention.py +2 -2
- sglang/srt/layers/rotary_embedding.py +1184 -31
- sglang/srt/layers/sampler.py +64 -6
- sglang/srt/layers/torchao_utils.py +12 -6
- sglang/srt/layers/vocab_parallel_embedding.py +2 -2
- sglang/srt/lora/lora.py +1 -9
- sglang/srt/managers/configure_logging.py +3 -0
- sglang/srt/managers/data_parallel_controller.py +79 -72
- sglang/srt/managers/detokenizer_manager.py +24 -6
- sglang/srt/managers/image_processor.py +158 -2
- sglang/srt/managers/io_struct.py +57 -3
- sglang/srt/managers/schedule_batch.py +78 -45
- sglang/srt/managers/schedule_policy.py +26 -12
- sglang/srt/managers/scheduler.py +326 -201
- sglang/srt/managers/session_controller.py +1 -0
- sglang/srt/managers/tokenizer_manager.py +210 -121
- sglang/srt/managers/tp_worker.py +6 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
- sglang/srt/managers/utils.py +44 -0
- sglang/srt/mem_cache/memory_pool.py +10 -32
- sglang/srt/metrics/collector.py +15 -6
- sglang/srt/model_executor/cuda_graph_runner.py +26 -30
- sglang/srt/model_executor/forward_batch_info.py +5 -7
- sglang/srt/model_executor/model_runner.py +44 -19
- sglang/srt/model_loader/loader.py +83 -6
- sglang/srt/model_loader/weight_utils.py +145 -6
- sglang/srt/models/baichuan.py +6 -6
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +17 -5
- sglang/srt/models/dbrx.py +13 -5
- sglang/srt/models/deepseek.py +3 -3
- sglang/srt/models/deepseek_v2.py +11 -11
- sglang/srt/models/exaone.py +2 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +15 -25
- sglang/srt/models/gpt2.py +3 -5
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/granite.py +2 -2
- sglang/srt/models/grok.py +4 -3
- sglang/srt/models/internlm2.py +2 -2
- sglang/srt/models/llama.py +7 -5
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/minicpm3.py +9 -9
- sglang/srt/models/minicpmv.py +1238 -0
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mixtral_quant.py +3 -3
- sglang/srt/models/mllama.py +2 -2
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/olmo2.py +4 -4
- sglang/srt/models/olmoe.py +7 -13
- sglang/srt/models/phi3_small.py +2 -2
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +41 -4
- sglang/srt/models/qwen2_moe.py +3 -3
- sglang/srt/models/qwen2_vl.py +22 -122
- sglang/srt/models/stablelm.py +2 -2
- sglang/srt/models/torch_native_llama.py +20 -7
- sglang/srt/models/xverse.py +6 -6
- sglang/srt/models/xverse_moe.py +6 -6
- sglang/srt/openai_api/adapter.py +139 -37
- sglang/srt/openai_api/protocol.py +7 -4
- sglang/srt/sampling/custom_logit_processor.py +38 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
- sglang/srt/sampling/sampling_batch_info.py +143 -18
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +4 -1090
- sglang/srt/server_args.py +77 -15
- sglang/srt/speculative/eagle_utils.py +37 -15
- sglang/srt/speculative/eagle_worker.py +11 -13
- sglang/srt/utils.py +164 -129
- sglang/test/runners.py +8 -13
- sglang/test/test_programs.py +2 -1
- sglang/test/test_utils.py +83 -22
- sglang/utils.py +12 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/METADATA +21 -10
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/RECORD +138 -123
- sglang/launch_server_llavavid.py +0 -25
- sglang/srt/constrained/__init__.py +0 -16
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
sglang/srt/models/xverse_moe.py
CHANGED
@@ -18,25 +18,25 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-
+
+from sglang.srt.distributed import (
 get_tensor_model_parallel_rank,
 get_tensor_model_parallel_world_size,
 tensor_model_parallel_all_reduce,
 )
-from
-from
-from
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
 MergedColumnParallelLinear,
 QKVParallelLinear,
 ReplicatedLinear,
 RowParallelLinear,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
-
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.fused_moe_triton import fused_moe
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
 ParallelLMHead,
 VocabParallelEmbedding,
sglang/srt/openai_api/adapter.py
CHANGED
@@ -20,7 +20,7 @@ import os
 import time
 import uuid
 from http import HTTPStatus
-from typing import Dict, List
+from typing import Dict, List, Optional

 from fastapi import HTTPException, Request, UploadFile
 from fastapi.responses import ORJSONResponse, StreamingResponse
@@ -40,6 +40,7 @@ from sglang.srt.conversation import (
 generate_chat_conv,
 register_conv_template,
 )
+from sglang.srt.function_call_parser import TOOLS_TAG_LIST, FunctionCallParser
 from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
 from sglang.srt.openai_api.protocol import (
 BatchRequest,
@@ -71,7 +72,6 @@ from sglang.srt.openai_api.protocol import (
 TopLogprob,
 UsageInfo,
 )
-from sglang.srt.utils import TOOLS_TAG_LIST, parse_tool_response
 from sglang.utils import get_exception_traceback

 logger = logging.getLogger(__name__)
@@ -309,6 +309,7 @@ async def process_batch(tokenizer_manager, batch_id: str, batch_request: BatchRe
 ret,
 to_file=True,
 cache_report=tokenizer_manager.server_args.enable_cache_report,
+tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
 )
 else:
 responses = v1_generate_response(
@@ -877,9 +878,6 @@ def v1_chat_generate_request(
 tools = None
 if request.tools and request.tool_choice != "none":
 request.skip_special_tokens = False
-if request.stream:
-logger.warning("Streaming is not supported with tools.")
-request.stream = False
 if not isinstance(request.tool_choice, str):
 tools = [
 item.function.model_dump()
@@ -908,12 +906,26 @@
 openai_compatible_messages = openai_compatible_messages[:-1]
 else:
 assistant_prefix = None
-
-
-
-
-
-
+
+try:
+prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
+openai_compatible_messages,
+tokenize=True,
+add_generation_prompt=True,
+tools=tools,
+)
+except:
+# This except branch will be triggered when the chosen model
+# has a different tools input format that is not compatiable
+# with openAI's apply_chat_template tool_call format, like Mistral.
+tools = [t if "function" in t else {"function": t} for t in tools]
+prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
+openai_compatible_messages,
+tokenize=True,
+add_generation_prompt=True,
+tools=tools,
+)
+
 if assistant_prefix:
 prompt_ids += tokenizer_manager.tokenizer.encode(assistant_prefix)
 stop = request.stop
@@ -1005,7 +1017,9 @@ def v1_chat_generate_request(
 return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]


-def v1_chat_generate_response(
+def v1_chat_generate_response(
+request, ret, to_file=False, cache_report=False, tool_call_parser=None
+):
 choices = []

 for idx, ret_item in enumerate(ret):
@@ -1066,12 +1080,13 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
 if finish_reason == "stop":
 finish_reason = "tool_calls"
 try:
-
+parser = FunctionCallParser(tools, tool_call_parser)
+full_normal_text, call_info_list = parser.parse_non_stream(text)
 tool_calls = [
 ToolCall(
-id=str(call_info
+id=str(call_info.tool_index),
 function=FunctionResponse(
-name=call_info
+name=call_info.name, arguments=call_info.parameters
 ),
 )
 for call_info in call_info_list
@@ -1172,6 +1187,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
 adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)

 if adapted_request.stream:
+parser_dict = {}

 async def generate_stream_resp():
 is_firsts = {}
@@ -1184,6 +1200,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
 adapted_request, raw_request
 ):
 index = content.get("index", 0)
+text = content["text"]

 is_first = is_firsts.get(index, True)
 stream_buffer = stream_buffers.get(index, "")
@@ -1263,29 +1280,111 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):

 text = content["text"]
 delta = text[len(stream_buffer) :]
-
-choice_data = ChatCompletionResponseStreamChoice(
-index=index,
-delta=DeltaMessage(content=delta),
-finish_reason=(finish_reason["type"] if finish_reason else ""),
-matched_stop=(
-finish_reason["matched"]
-if finish_reason and "matched" in finish_reason
-else None
-),
-logprobs=choice_logprobs,
-)
-chunk = ChatCompletionStreamResponse(
-id=content["meta_info"]["id"],
-choices=[choice_data],
-model=request.model,
-)
+new_stream_buffer = stream_buffer + delta

-
-
-
+if request.tool_choice != "none" and request.tools:
+if index not in parser_dict:
+parser_dict[index] = FunctionCallParser(
+tools=request.tools,
+tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
+)
+parser = parser_dict[index]
+
+# parse_increment => returns (normal_text, calls)
+normal_text, calls = parser.parse_stream_chunk(delta)
+
+# 1) if there's normal_text, output it as normal content
+if normal_text:
+choice_data = ChatCompletionResponseStreamChoice(
+index=index,
+delta=DeltaMessage(content=normal_text),
+finish_reason=(
+finish_reason["type"] if finish_reason else ""
+),
+)
+chunk = ChatCompletionStreamResponse(
+id=content["meta_info"]["id"],
+choices=[choice_data],
+model=request.model,
+)
+yield f"data: {chunk.model_dump_json()}\n\n"
+
+# 2) if we found calls, we output them as separate chunk(s)
+for call_item in calls:
+# transform call_item -> FunctionResponse + ToolCall
+
+if (
+content["meta_info"]["finish_reason"]
+and content["meta_info"]["finish_reason"]["type"]
+== "stop"
+):
+latest_delta_len = 0
+if isinstance(call_item.parameters, str):
+latest_delta_len = len(call_item.parameters)
+
+expected_call = json.dumps(
+parser.multi_format_parser.detectors[0]
+.prev_tool_call_arr[index]
+.get("arguments", {}),
+ensure_ascii=False,
+)
+actual_call = parser.multi_format_parser.detectors[
+0
+].streamed_args_for_tool[index]
+if latest_delta_len > 0:
+actual_call = actual_call[:-latest_delta_len]
+remaining_call = expected_call.replace(
+actual_call, "", 1
+)
+call_item.parameters = remaining_call
+
+tool_call = ToolCall(
+id=str(call_item.tool_index),
+function=FunctionResponse(
+name=call_item.name,
+arguments=call_item.parameters,
+),
+)
+choice_data = ChatCompletionResponseStreamChoice(
+index=index,
+delta=DeltaMessage(
+role="assistant", tool_calls=[tool_call]
+),
+finish_reason="tool_call",
+)
+chunk = ChatCompletionStreamResponse(
+id=content["meta_info"]["id"],
+choices=[choice_data],
+model=request.model,
+)
+yield f"data: {chunk.model_dump_json()}\n\n"

-
+stream_buffers[index] = new_stream_buffer
+is_firsts[index] = is_first
+
+else:
+# No tool calls => just treat this as normal text
+choice_data = ChatCompletionResponseStreamChoice(
+index=index,
+delta=DeltaMessage(content=delta),
+finish_reason=(
+finish_reason["type"] if finish_reason else ""
+),
+matched_stop=(
+finish_reason["matched"]
+if finish_reason and "matched" in finish_reason
+else None
+),
+logprobs=choice_logprobs,
+)
+chunk = ChatCompletionStreamResponse(
+id=content["meta_info"]["id"],
+choices=[choice_data],
+model=request.model,
+)
+yield f"data: {chunk.model_dump_json()}\n\n"
+stream_buffers[index] = new_stream_buffer
+is_firsts[index] = is_first
 if request.stream_options and request.stream_options.include_usage:
 total_prompt_tokens = sum(
 tokens
@@ -1333,7 +1432,10 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
 ret = [ret]

 response = v1_chat_generate_response(
-request,
+request,
+ret,
+cache_report=tokenizer_manager.server_args.enable_cache_report,
+tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
 )

 return response
sglang/srt/openai_api/protocol.py
CHANGED
@@ -180,6 +180,7 @@ class CompletionRequest(BaseModel):
 ignore_eos: bool = False
 skip_special_tokens: bool = True
 lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+session_params: Optional[Dict] = None


 class CompletionResponseChoice(BaseModel):
@@ -261,7 +262,7 @@ class Function(BaseModel):
 """Function descriptions."""

 description: Optional[str] = Field(default=None, examples=[None])
-name: str
+name: Optional[str] = None
 parameters: Optional[object] = None


@@ -275,7 +276,7 @@ class Tool(BaseModel):
 class ToolChoiceFuncName(BaseModel):
 """The name of tool choice function."""

-name: str
+name: Optional[str] = None


 class ToolChoice(BaseModel):
@@ -322,13 +323,14 @@ class ChatCompletionRequest(BaseModel):
 ignore_eos: bool = False
 skip_special_tokens: bool = True
 lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+session_params: Optional[Dict] = None


 class FunctionResponse(BaseModel):
 """Function response."""

-name: str
-arguments: str
+name: Optional[str] = None
+arguments: Optional[str] = None


 class ToolCall(BaseModel):
@@ -365,6 +367,7 @@ class ChatCompletionResponse(BaseModel):
 class DeltaMessage(BaseModel):
 role: Optional[str] = None
 content: Optional[str] = None
+tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])


 class ChatCompletionResponseStreamChoice(BaseModel):
sglang/srt/sampling/custom_logit_processor.py
ADDED
@@ -0,0 +1,38 @@
+import json
+from abc import ABC, abstractmethod
+from functools import lru_cache
+from typing import Any, Dict, List, Optional
+
+import dill
+import torch
+
+
+@lru_cache(maxsize=None)
+def _cache_from_str(json_str: str):
+"""Deserialize a json string to a Callable object.
+This function is cached to avoid redundant deserialization.
+"""
+data = json.loads(json_str)
+return dill.loads(bytes.fromhex(data["callable"]))
+
+
+class CustomLogitProcessor(ABC):
+"""Abstract base class for callable functions."""
+
+@abstractmethod
+def __call__(
+self,
+logits: torch.Tensor,
+custom_param_list: Optional[List[Dict[str, Any]]] = None,
+) -> torch.Tensor:
+"""Define the callable behavior."""
+raise NotImplementedError
+
+def to_str(self) -> str:
+"""Serialize the callable function to a JSON-compatible string."""
+return json.dumps({"callable": dill.dumps(self).hex()})
+
+@classmethod
+def from_str(cls, json_str: str):
+"""Deserialize a callable function from a JSON string."""
+return _cache_from_str(json_str)
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py
CHANGED
@@ -3,11 +3,16 @@ from typing import List
 import torch

 from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs
-from sglang.srt.utils import
+from sglang.srt.utils import get_compiler_backend

-
-
-
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def apply_scaling_penalties(logits, scaling_penalties):
+logits[:] = torch.where(
+logits > 0,
+logits / scaling_penalties,
+logits * scaling_penalties,
+)


 class BatchedRepetitionPenalizer(_BatchedPenalizer):
@@ -61,16 +66,8 @@ class BatchedRepetitionPenalizer(_BatchedPenalizer):
 self.cumulated_repetition_penalties[mask] = self.repetition_penalties[mask]

 def _apply(self, logits: torch.Tensor) -> torch.Tensor:
-
-
-logits, self.cumulated_repetition_penalties
-)
-else:
-return torch.where(
-logits > 0,
-logits / self.cumulated_repetition_penalties,
-logits * self.cumulated_repetition_penalties,
-)
+apply_scaling_penalties(logits, self.cumulated_repetition_penalties)
+return logits

 def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
 self.repetition_penalties = self.repetition_penalties[indices_tensor_to_keep]
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -3,17 +3,15 @@ from __future__ import annotations
 import dataclasses
 import logging
 import threading
-from typing import TYPE_CHECKING, Callable, List, Optional
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple

 import torch

-from sglang.srt.utils import is_cuda_available
-
-is_cuda = is_cuda_available()
-if is_cuda:
-from sgl_kernel import sampling_scaling_penalties
-
 import sglang.srt.sampling.penaltylib as penaltylib
+from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
+from sglang.srt.sampling.penaltylib.penalizers.repetition_penalty import (
+apply_scaling_penalties,
+)

 logger = logging.getLogger(__name__)

@@ -36,6 +34,9 @@ class SamplingBatchInfo:
 # Dispatch in CUDA graph
 need_min_p_sampling: bool

+# Whether any request has custom logit processor
+has_custom_logit_processor: bool
+
 # Bias Tensors
 vocab_size: int
 grammars: Optional[List] = None
@@ -52,6 +53,14 @@ class SamplingBatchInfo:
 # Device
 device: str = "cuda"

+# Custom Parameters
+custom_params: Optional[List[Optional[Dict[str, Any]]]] = None
+
+# Custom Logit Processor
+custom_logit_processor: Optional[
+Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]
+] = None
+
 @classmethod
 def from_schedule_batch(
 cls, batch: ScheduleBatch, vocab_size: int, enable_overlap_schedule: bool
@@ -76,6 +85,39 @@ class SamplingBatchInfo:
 [r.sampling_params.min_p for r in reqs], dtype=torch.float
 ).to(device, non_blocking=True)

+# Check if any request has custom logit processor
+has_custom_logit_processor = (
+batch.enable_custom_logit_processor  # check the flag first.
+and any(r.custom_logit_processor for r in reqs)  # then check the requests.
+)
+
+if has_custom_logit_processor:
+# Merge the same type of custom logit processors together
+processor_dict = {}
+for i, r in enumerate(reqs):
+if r.custom_logit_processor is None:
+continue
+processor_str = r.custom_logit_processor
+if processor_str not in processor_dict:
+processor_dict[processor_str] = []
+processor_dict[processor_str].append(i)
+
+merged_custom_logit_processor = {
+hash(processor_str): (
+# The deserialized custom logit processor object
+CustomLogitProcessor.from_str(processor_str),
+# The mask tensor for the requests that use this custom logit processor
+torch.zeros(len(reqs), dtype=torch.bool)
+.scatter_(0, torch.tensor(true_indices), True)
+.to(device, non_blocking=True),
+)
+for processor_str, true_indices in processor_dict.items()
+}
+custom_params = [r.sampling_params.custom_params for r in reqs]
+else:
+merged_custom_logit_processor = None
+custom_params = None
+
 ret = cls(
 temperatures=temperatures,
 top_ps=top_ps,
@@ -83,8 +125,11 @@ class SamplingBatchInfo:
 min_ps=min_ps,
 need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
 is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
+has_custom_logit_processor=has_custom_logit_processor,
 vocab_size=vocab_size,
 device=device,
+custom_params=custom_params,
+custom_logit_processor=merged_custom_logit_processor,
 )
 # TODO (lianmin): `need_min_p_sampling` needs to be updated in filter and merge.

@@ -184,6 +229,8 @@ class SamplingBatchInfo:

 def filter_batch(self, unfinished_indices: List[int], new_indices: torch.Tensor):
 self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
+if self.has_custom_logit_processor:
+self._filter_batch_custom_logit_processor(unfinished_indices, new_indices)

 for item in [
 "temperatures",
@@ -196,6 +243,27 @@ class SamplingBatchInfo:
 if value is not None: # logit_bias can be None
 setattr(self, item, value[new_indices])

+def _filter_batch_custom_logit_processor(
+self, unfinished_indices: List[int], new_indices: torch.Tensor
+):
+"""Filter the custom logit processor and custom params"""
+
+self.custom_logit_processor = {
+k: (p, mask[new_indices])
+for k, (p, mask) in self.custom_logit_processor.items()
+if any(
+mask[new_indices]
+)  # ignore the custom logit processor whose mask is all False
+}
+self.custom_params = [self.custom_params[i] for i in unfinished_indices]
+
+# If the custom logit processor is an empty dict, set the flag to False,
+# and set the custom logit processor and custom params to None.
+if len(self.custom_logit_processor) == 0:
+self.custom_logit_processor = None
+self.custom_params = None
+self.has_custom_logit_processor = False
+
 @staticmethod
 def merge_bias_tensor(
 lhs: torch.Tensor,
@@ -221,9 +289,76 @@ class SamplingBatchInfo:

 return None

+@staticmethod
+def merge_custom_logit_processor(
+lhs: Optional[Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]],
+rhs: Optional[Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]],
+bs1: int,
+bs2: int,
+device: str,
+):
+if lhs is None and rhs is None:
+return None
+lhs, rhs = lhs or {}, rhs or {}
+
+keys = set(lhs.keys()).union(set(rhs.keys()))
+merged_dict = {}
+
+for k in keys:
+# Get the logit processor object
+processor = lhs[k][0] if k in lhs else rhs[k][0]
+# Get and merge the mask tensors from the two dicts
+left_mask = (
+lhs[k][1]
+if k in lhs
+else torch.zeros(bs1, dtype=torch.bool, device=device)
+)
+right_mask = (
+rhs[k][1]
+if k in rhs
+else torch.zeros(bs2, dtype=torch.bool, device=device)
+)
+merged_dict[k] = (processor, torch.cat([left_mask, right_mask]))
+
+assert merged_dict[k][1].shape[0] == bs1 + bs2, (
+f"The batch size of merged mask ({merged_dict[k][1].shape[0]}) does not match "
+f"the sum of the batch sizes of the two masks ({bs1 + bs2})"
+f"\n{left_mask=}\n{right_mask=}\n{bs1=}\n{bs2=}"
+f"\n{lhs=}\n{rhs=}"
+)
+
+return merged_dict
+
 def merge_batch(self, other: "SamplingBatchInfo"):
 self.penalizer_orchestrator.merge(other.penalizer_orchestrator)

+# Merge the logit bias tensor
+self.logit_bias = SamplingBatchInfo.merge_bias_tensor(
+self.logit_bias, other.logit_bias, len(self), len(other), self.device
+)
+# Merge the custom logit processors and custom params lists
+if self.has_custom_logit_processor or other.has_custom_logit_processor:
+# Merge the custom logit processors
+self.custom_logit_processor = (
+SamplingBatchInfo.merge_custom_logit_processor(
+self.custom_logit_processor,
+other.custom_logit_processor,
+len(self),
+len(other),
+self.device,
+)
+)
+# Merge the custom params lists
+self.custom_params = self.custom_params or [None] * len(self)
+other.custom_params = other.custom_params or [None] * len(other)
+self.custom_params.extend(other.custom_params)
+
+# Set the flag to True if any of the two has custom logit processor
+self.has_custom_logit_processor = True
+
+# Note: becasue the __len()__ operator is defined on the temperatures tensor,
+# please make sure any merge operation with len(self) or len(other) is done before
+# the merge operation of the temperatures tensor below.
 for item in [
 "temperatures",
 "top_ps",
@@ -235,9 +370,6 @@ class SamplingBatchInfo:
 setattr(self, item, torch.concat([self_val, other_val]))

 self.is_all_greedy = self.is_all_greedy and other.is_all_greedy
-self.logit_bias = SamplingBatchInfo.merge_bias_tensor(
-self.logit_bias, other.logit_bias, len(self), len(other), self.device
-)
 self.need_min_p_sampling = self.need_min_p_sampling or other.need_min_p_sampling

 def apply_logits_bias(self, logits: torch.Tensor):
@@ -251,14 +383,7 @@ class SamplingBatchInfo:

 # repetition
 if self.scaling_penalties is not None:
-
-logits[:] = sampling_scaling_penalties(logits, self.scaling_penalties)
-else:
-logits[:] = torch.where(
-logits > 0,
-logits / self.scaling_penalties,
-logits * self.scaling_penalties,
-)
+apply_scaling_penalties(logits, self.scaling_penalties)

 # Apply regex vocab_mask
 if self.vocab_mask is not None:
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -13,7 +13,7 @@
 # ==============================================================================
 """Sampling parameters for text generation."""

-from typing import List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 _SAMPLING_EPS = 1e-6

@@ -48,6 +48,7 @@ class SamplingParams:
 no_stop_trim: bool = False,
 ignore_eos: bool = False,
 skip_special_tokens: bool = True,
+custom_params: Optional[Dict[str, Any]] = None,
 ) -> None:
 self.temperature = temperature
 self.top_p = top_p
@@ -71,6 +72,7 @@ class SamplingParams:
 self.json_schema = json_schema
 self.ebnf = ebnf
 self.no_stop_trim = no_stop_trim
+self.custom_params = custom_params

 # Process some special cases
 if self.temperature < _SAMPLING_EPS: