sglang 0.4.1.post5__py3-none-any.whl → 0.4.1.post7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +21 -23
- sglang/api.py +2 -7
- sglang/bench_offline_throughput.py +24 -16
- sglang/bench_one_batch.py +51 -3
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +37 -28
- sglang/lang/backend/runtime_endpoint.py +183 -4
- sglang/lang/chat_template.py +15 -4
- sglang/launch_server.py +1 -1
- sglang/srt/_custom_ops.py +80 -42
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/model_config.py +16 -6
- sglang/srt/constrained/base_grammar_backend.py +21 -0
- sglang/srt/constrained/xgrammar_backend.py +8 -4
- sglang/srt/conversation.py +14 -1
- sglang/srt/distributed/__init__.py +3 -3
- sglang/srt/distributed/communication_op.py +2 -1
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +107 -40
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
- sglang/srt/distributed/device_communicators/pynccl.py +80 -1
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
- sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
- sglang/srt/distributed/parallel_state.py +1 -1
- sglang/srt/distributed/utils.py +2 -1
- sglang/srt/entrypoints/engine.py +449 -0
- sglang/srt/entrypoints/http_server.py +579 -0
- sglang/srt/layers/activation.py +3 -3
- sglang/srt/layers/attention/flashinfer_backend.py +27 -12
- sglang/srt/layers/attention/triton_backend.py +4 -6
- sglang/srt/layers/attention/vision.py +204 -0
- sglang/srt/layers/dp_attention.py +69 -0
- sglang/srt/layers/linear.py +76 -102
- sglang/srt/layers/logits_processor.py +48 -63
- sglang/srt/layers/moe/ep_moe/layer.py +4 -4
- sglang/srt/layers/moe/fused_moe_native.py +69 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -6
- sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -14
- sglang/srt/layers/moe/topk.py +4 -2
- sglang/srt/layers/parameter.py +26 -17
- sglang/srt/layers/quantization/__init__.py +22 -23
- sglang/srt/layers/quantization/fp8.py +112 -55
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/int8_kernel.py +54 -0
- sglang/srt/layers/quantization/modelopt_quant.py +2 -3
- sglang/srt/layers/quantization/w8a8_int8.py +117 -0
- sglang/srt/layers/radix_attention.py +2 -0
- sglang/srt/layers/rotary_embedding.py +1179 -31
- sglang/srt/layers/sampler.py +39 -1
- sglang/srt/layers/vocab_parallel_embedding.py +17 -4
- sglang/srt/lora/lora.py +1 -9
- sglang/srt/managers/configure_logging.py +46 -0
- sglang/srt/managers/data_parallel_controller.py +79 -72
- sglang/srt/managers/detokenizer_manager.py +23 -8
- sglang/srt/managers/image_processor.py +158 -2
- sglang/srt/managers/io_struct.py +54 -15
- sglang/srt/managers/schedule_batch.py +49 -22
- sglang/srt/managers/schedule_policy.py +26 -12
- sglang/srt/managers/scheduler.py +319 -181
- sglang/srt/managers/session_controller.py +1 -0
- sglang/srt/managers/tokenizer_manager.py +303 -158
- sglang/srt/managers/tp_worker.py +6 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
- sglang/srt/managers/utils.py +44 -0
- sglang/srt/mem_cache/memory_pool.py +110 -77
- sglang/srt/metrics/collector.py +25 -11
- sglang/srt/model_executor/cuda_graph_runner.py +4 -6
- sglang/srt/model_executor/model_runner.py +80 -21
- sglang/srt/model_loader/loader.py +8 -6
- sglang/srt/model_loader/weight_utils.py +55 -2
- sglang/srt/models/baichuan.py +6 -6
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +3 -3
- sglang/srt/models/dbrx.py +4 -4
- sglang/srt/models/deepseek.py +3 -3
- sglang/srt/models/deepseek_v2.py +8 -8
- sglang/srt/models/exaone.py +2 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +6 -24
- sglang/srt/models/gpt2.py +3 -5
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/granite.py +2 -2
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/internlm2.py +2 -2
- sglang/srt/models/llama.py +41 -4
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/minicpm3.py +6 -6
- sglang/srt/models/minicpmv.py +1238 -0
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mixtral_quant.py +3 -3
- sglang/srt/models/mllama.py +2 -2
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/olmo2.py +4 -4
- sglang/srt/models/olmoe.py +7 -13
- sglang/srt/models/phi3_small.py +2 -2
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +52 -4
- sglang/srt/models/qwen2_eagle.py +131 -0
- sglang/srt/models/qwen2_moe.py +3 -3
- sglang/srt/models/qwen2_vl.py +22 -122
- sglang/srt/models/stablelm.py +2 -2
- sglang/srt/models/torch_native_llama.py +3 -3
- sglang/srt/models/xverse.py +6 -6
- sglang/srt/models/xverse_moe.py +6 -6
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/custom_logit_processor.py +38 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +15 -5
- sglang/srt/sampling/sampling_batch_info.py +153 -9
- sglang/srt/sampling/sampling_params.py +4 -2
- sglang/srt/server.py +4 -1037
- sglang/srt/server_args.py +84 -32
- sglang/srt/speculative/eagle_worker.py +1 -0
- sglang/srt/torch_memory_saver_adapter.py +59 -0
- sglang/srt/utils.py +130 -63
- sglang/test/runners.py +8 -13
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +3 -1
- sglang/utils.py +12 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/METADATA +26 -13
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/RECORD +126 -117
- sglang/launch_server_llavavid.py +0 -25
- sglang/srt/constrained/__init__.py +0 -16
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post5.dist-info → sglang-0.4.1.post7.dist-info}/top_level.txt +0 -0
sglang/srt/models/stablelm.py
CHANGED
@@ -24,9 +24,8 @@ from typing import Iterable, Optional, Tuple
import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope

+from sglang.srt.distributed import get_tensor_model_parallel_world_size
from sglang.srt.layers.activation import SiluAndMul
from sglang.srt.layers.linear import (
    MergedColumnParallelLinear,
@@ -36,6 +35,7 @@ from sglang.srt.layers.linear import (
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
from sglang.srt.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
@@ -47,17 +47,17 @@ import torch
from torch import nn
from torch.nn.parameter import Parameter
from transformers import LlamaConfig
-from vllm.distributed import (
+
+from sglang.srt.distributed import (
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
)
-from vllm.model_executor.layers.rotary_embedding import get_rope
-
from sglang.srt.layers.activation import SiluAndMul
from sglang.srt.layers.layernorm import RMSNorm
from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
from sglang.srt.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
sglang/srt/models/xverse.py
CHANGED
@@ -21,19 +21,19 @@ from typing import Any, Dict, Iterable, Optional, Tuple
import torch
from torch import nn
from transformers import LlamaConfig
-
-from
-from
-from
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
    MergedColumnParallelLinear,
    QKVParallelLinear,
    RowParallelLinear,
)
-from vllm.model_executor.layers.rotary_embedding import get_rope
-
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
from sglang.srt.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
sglang/srt/models/xverse_moe.py
CHANGED
@@ -18,25 +18,25 @@ from typing import Any, Dict, Iterable, Optional, Tuple
import torch
from torch import nn
from transformers import PretrainedConfig
-from vllm.distributed import (
+
+from sglang.srt.distributed import (
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
    tensor_model_parallel_all_reduce,
)
-from
-from
-from
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
    MergedColumnParallelLinear,
    QKVParallelLinear,
    ReplicatedLinear,
    RowParallelLinear,
)
-from vllm.model_executor.layers.rotary_embedding import get_rope
-
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.moe.fused_moe_triton import fused_moe
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
from sglang.srt.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
sglang/srt/openai_api/protocol.py
CHANGED
@@ -180,6 +180,7 @@ class CompletionRequest(BaseModel):
    ignore_eos: bool = False
    skip_special_tokens: bool = True
    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    session_params: Optional[Dict] = None


class CompletionResponseChoice(BaseModel):
@@ -322,6 +323,7 @@ class ChatCompletionRequest(BaseModel):
    ignore_eos: bool = False
    skip_special_tokens: bool = True
    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    session_params: Optional[Dict] = None


class FunctionResponse(BaseModel):
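session_params is an untyped dict on both request models, so the accepted keys are defined by the session handling elsewhere in the runtime rather than by this schema. A hypothetical request using it; the endpoint, model value, and the "session_id" key are assumptions shown only to illustrate where the field sits in the payload:

import requests

# Illustrative only: the keys inside "session_params" are not defined by this
# Pydantic schema, so "session_id" here is a placeholder.
resp = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "default",
        "messages": [{"role": "user", "content": "Hello"}],
        "session_params": {"session_id": "example-session"},
    },
)
print(resp.json())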
sglang/srt/sampling/custom_logit_processor.py
ADDED
@@ -0,0 +1,38 @@
+import json
+from abc import ABC, abstractmethod
+from functools import lru_cache
+from typing import Any, Dict, List, Optional
+
+import dill
+import torch
+
+
+@lru_cache(maxsize=None)
+def _cache_from_str(json_str: str):
+    """Deserialize a json string to a Callable object.
+    This function is cached to avoid redundant deserialization.
+    """
+    data = json.loads(json_str)
+    return dill.loads(bytes.fromhex(data["callable"]))
+
+
+class CustomLogitProcessor(ABC):
+    """Abstract base class for callable functions."""
+
+    @abstractmethod
+    def __call__(
+        self,
+        logits: torch.Tensor,
+        custom_param_list: Optional[List[Dict[str, Any]]] = None,
+    ) -> torch.Tensor:
+        """Define the callable behavior."""
+        raise NotImplementedError
+
+    def to_str(self) -> str:
+        """Serialize the callable function to a JSON-compatible string."""
+        return json.dumps({"callable": dill.dumps(self).hex()})
+
+    @classmethod
+    def from_str(cls, json_str: str):
+        """Deserialize a callable function from a JSON string."""
+        return _cache_from_str(json_str)
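For orientation, a minimal sketch of how this new interface could be subclassed and round-tripped. The subclass name and the "token_id"/"bias" parameter keys are invented for the example; only __call__, to_str, and from_str come from the file above:

import torch

from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor


class BiasTokenLogitProcessor(CustomLogitProcessor):
    # Example only: adds a per-request bias to one token id taken from the
    # per-request custom parameters.
    def __call__(self, logits, custom_param_list=None):
        if custom_param_list is None:
            return logits
        for i, params in enumerate(custom_param_list):
            token_id = params.get("token_id") if params else None
            if token_id is not None:
                logits[i, token_id] += params.get("bias", 0.0)
        return logits


# dill serializes the processor by value, so the hex string can travel with a
# request and be rebuilt (and cached via lru_cache) on the server side.
serialized = BiasTokenLogitProcessor().to_str()
rebuilt = CustomLogitProcessor.from_str(serialized)
out = rebuilt(torch.zeros(1, 8), [{"token_id": 3, "bias": 2.0}])
print(out)  # token 3 of request 0 now carries the +2.0 bias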
sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py
CHANGED
@@ -3,6 +3,11 @@ from typing import List
import torch

from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer, _TokenIDs
+from sglang.srt.utils import is_cuda_available
+
+is_cuda = is_cuda_available()
+if is_cuda:
+    from sgl_kernel import sampling_scaling_penalties


class BatchedRepetitionPenalizer(_BatchedPenalizer):
@@ -56,11 +61,16 @@ class BatchedRepetitionPenalizer(_BatchedPenalizer):
        self.cumulated_repetition_penalties[mask] = self.repetition_penalties[mask]

    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
-        return torch.where(
-            logits > 0,
-            logits / self.cumulated_repetition_penalties,
-            logits * self.cumulated_repetition_penalties,
-        )
+        if is_cuda:
+            return sampling_scaling_penalties(
+                logits, self.cumulated_repetition_penalties
+            )
+        else:
+            return torch.where(
+                logits > 0,
+                logits / self.cumulated_repetition_penalties,
+                logits * self.cumulated_repetition_penalties,
+            )

    def _filter(self, indices_to_keep: List[int], indices_tensor_to_keep: torch.Tensor):
        self.repetition_penalties = self.repetition_penalties[indices_tensor_to_keep]
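On CUDA the sampling_scaling_penalties kernel from sgl_kernel is used; elsewhere the pure-PyTorch branch applies the standard repetition-penalty rule (divide positive logits, multiply negative ones). A self-contained sketch of that fallback rule on toy tensors, independent of the penalizer class:

import torch

# Two requests, four candidate tokens each.
logits = torch.tensor([[2.0, -1.0, 0.5, -0.5],
                       [1.0, 3.0, -2.0, 0.0]])
# Cumulative penalties (1.0 means "not penalized").
penalties = torch.tensor([[1.2, 1.2, 1.0, 1.0],
                          [1.5, 1.5, 1.5, 1.0]])

# Same rule as the else-branch above: penalized tokens always become less likely.
penalized = torch.where(logits > 0, logits / penalties, logits * penalties)
print(penalized)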
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -3,11 +3,18 @@ from __future__ import annotations
import dataclasses
import logging
import threading
-from typing import TYPE_CHECKING, Callable, List, Optional
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple

import torch

+from sglang.srt.utils import is_cuda_available
+
+is_cuda = is_cuda_available()
+if is_cuda:
+    from sgl_kernel import sampling_scaling_penalties
+
import sglang.srt.sampling.penaltylib as penaltylib
+from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor

logger = logging.getLogger(__name__)

@@ -30,6 +37,9 @@ class SamplingBatchInfo:
    # Dispatch in CUDA graph
    need_min_p_sampling: bool

+    # Whether any request has custom logit processor
+    has_custom_logit_processor: bool
+
    # Bias Tensors
    vocab_size: int
    grammars: Optional[List] = None
@@ -46,6 +56,14 @@ class SamplingBatchInfo:
    # Device
    device: str = "cuda"

+    # Custom Parameters
+    custom_params: Optional[List[Optional[Dict[str, Any]]]] = None
+
+    # Custom Logit Processor
+    custom_logit_processor: Optional[
+        Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]
+    ] = None
+
    @classmethod
    def from_schedule_batch(
        cls, batch: ScheduleBatch, vocab_size: int, enable_overlap_schedule: bool
@@ -70,6 +88,39 @@ class SamplingBatchInfo:
            [r.sampling_params.min_p for r in reqs], dtype=torch.float
        ).to(device, non_blocking=True)

+        # Check if any request has custom logit processor
+        has_custom_logit_processor = (
+            batch.enable_custom_logit_processor  # check the flag first.
+            and any(r.custom_logit_processor for r in reqs)  # then check the requests.
+        )
+
+        if has_custom_logit_processor:
+            # Merge the same type of custom logit processors together
+            processor_dict = {}
+            for i, r in enumerate(reqs):
+                if r.custom_logit_processor is None:
+                    continue
+                processor_str = r.custom_logit_processor
+                if processor_str not in processor_dict:
+                    processor_dict[processor_str] = []
+                processor_dict[processor_str].append(i)
+
+            merged_custom_logit_processor = {
+                hash(processor_str): (
+                    # The deserialized custom logit processor object
+                    CustomLogitProcessor.from_str(processor_str),
+                    # The mask tensor for the requests that use this custom logit processor
+                    torch.zeros(len(reqs), dtype=torch.bool)
+                    .scatter_(0, torch.tensor(true_indices), True)
+                    .to(device, non_blocking=True),
+                )
+                for processor_str, true_indices in processor_dict.items()
+            }
+            custom_params = [r.sampling_params.custom_params for r in reqs]
+        else:
+            merged_custom_logit_processor = None
+            custom_params = None
+
        ret = cls(
            temperatures=temperatures,
            top_ps=top_ps,
@@ -77,8 +128,11 @@ class SamplingBatchInfo:
            min_ps=min_ps,
            need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
            is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
+            has_custom_logit_processor=has_custom_logit_processor,
            vocab_size=vocab_size,
            device=device,
+            custom_params=custom_params,
+            custom_logit_processor=merged_custom_logit_processor,
        )
        # TODO (lianmin): `need_min_p_sampling` needs to be updated in filter and merge.

@@ -178,6 +232,8 @@ class SamplingBatchInfo:

    def filter_batch(self, unfinished_indices: List[int], new_indices: torch.Tensor):
        self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
+        if self.has_custom_logit_processor:
+            self._filter_batch_custom_logit_processor(unfinished_indices, new_indices)

        for item in [
            "temperatures",
@@ -190,6 +246,27 @@ class SamplingBatchInfo:
            if value is not None:  # logit_bias can be None
                setattr(self, item, value[new_indices])

+    def _filter_batch_custom_logit_processor(
+        self, unfinished_indices: List[int], new_indices: torch.Tensor
+    ):
+        """Filter the custom logit processor and custom params"""
+
+        self.custom_logit_processor = {
+            k: (p, mask[new_indices])
+            for k, (p, mask) in self.custom_logit_processor.items()
+            if any(
+                mask[new_indices]
+            )  # ignore the custom logit processor whose mask is all False
+        }
+        self.custom_params = [self.custom_params[i] for i in unfinished_indices]
+
+        # If the custom logit processor is an empty dict, set the flag to False,
+        # and set the custom logit processor and custom params to None.
+        if len(self.custom_logit_processor) == 0:
+            self.custom_logit_processor = None
+            self.custom_params = None
+            self.has_custom_logit_processor = False
+
    @staticmethod
    def merge_bias_tensor(
        lhs: torch.Tensor,
@@ -215,9 +292,76 @@ class SamplingBatchInfo:

        return None

+    @staticmethod
+    def merge_custom_logit_processor(
+        lhs: Optional[Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]],
+        rhs: Optional[Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]],
+        bs1: int,
+        bs2: int,
+        device: str,
+    ):
+        if lhs is None and rhs is None:
+            return None
+        lhs, rhs = lhs or {}, rhs or {}
+
+        keys = set(lhs.keys()).union(set(rhs.keys()))
+        merged_dict = {}
+
+        for k in keys:
+            # Get the logit processor object
+            processor = lhs[k][0] if k in lhs else rhs[k][0]
+            # Get and merge the mask tensors from the two dicts
+            left_mask = (
+                lhs[k][1]
+                if k in lhs
+                else torch.zeros(bs1, dtype=torch.bool, device=device)
+            )
+            right_mask = (
+                rhs[k][1]
+                if k in rhs
+                else torch.zeros(bs2, dtype=torch.bool, device=device)
+            )
+            merged_dict[k] = (processor, torch.cat([left_mask, right_mask]))

+            assert merged_dict[k][1].shape[0] == bs1 + bs2, (
+                f"The batch size of merged mask ({merged_dict[k][1].shape[0]}) does not match "
+                f"the sum of the batch sizes of the two masks ({bs1 + bs2})"
+                f"\n{left_mask=}\n{right_mask=}\n{bs1=}\n{bs2=}"
+                f"\n{lhs=}\n{rhs=}"
+            )
+
+        return merged_dict
+
    def merge_batch(self, other: "SamplingBatchInfo"):
        self.penalizer_orchestrator.merge(other.penalizer_orchestrator)

+        # Merge the logit bias tensor
+        self.logit_bias = SamplingBatchInfo.merge_bias_tensor(
+            self.logit_bias, other.logit_bias, len(self), len(other), self.device
+        )
+        # Merge the custom logit processors and custom params lists
+        if self.has_custom_logit_processor or other.has_custom_logit_processor:
+            # Merge the custom logit processors
+            self.custom_logit_processor = (
+                SamplingBatchInfo.merge_custom_logit_processor(
+                    self.custom_logit_processor,
+                    other.custom_logit_processor,
+                    len(self),
+                    len(other),
+                    self.device,
+                )
+            )
+            # Merge the custom params lists
+            self.custom_params = self.custom_params or [None] * len(self)
+            other.custom_params = other.custom_params or [None] * len(other)
+            self.custom_params.extend(other.custom_params)
+
+            # Set the flag to True if any of the two has custom logit processor
+            self.has_custom_logit_processor = True
+
+        # Note: becasue the __len()__ operator is defined on the temperatures tensor,
+        # please make sure any merge operation with len(self) or len(other) is done before
+        # the merge operation of the temperatures tensor below.
        for item in [
            "temperatures",
            "top_ps",
@@ -229,9 +373,6 @@ class SamplingBatchInfo:
            setattr(self, item, torch.concat([self_val, other_val]))

        self.is_all_greedy = self.is_all_greedy and other.is_all_greedy
-        self.logit_bias = SamplingBatchInfo.merge_bias_tensor(
-            self.logit_bias, other.logit_bias, len(self), len(other), self.device
-        )
        self.need_min_p_sampling = self.need_min_p_sampling or other.need_min_p_sampling

    def apply_logits_bias(self, logits: torch.Tensor):
@@ -245,11 +386,14 @@ class SamplingBatchInfo:

        # repetition
        if self.scaling_penalties is not None:
-            logits[:] = torch.where(
-                logits > 0,
-                logits / self.scaling_penalties,
-                logits * self.scaling_penalties,
-            )
+            if is_cuda:
+                logits[:] = sampling_scaling_penalties(logits, self.scaling_penalties)
+            else:
+                logits[:] = torch.where(
+                    logits > 0,
+                    logits / self.scaling_penalties,
+                    logits * self.scaling_penalties,
+                )

        # Apply regex vocab_mask
        if self.vocab_mask is not None:
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -13,7 +13,7 @@
# ==============================================================================
"""Sampling parameters for text generation."""

-from typing import List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

_SAMPLING_EPS = 1e-6

@@ -23,7 +23,7 @@ class SamplingParams:
    The sampling parameters.

    See docs/references/sampling_params.md or
-    https://
+    https://docs.sglang.ai/references/sampling_params.html
    for the documentation.
    """

@@ -48,6 +48,7 @@ class SamplingParams:
        no_stop_trim: bool = False,
        ignore_eos: bool = False,
        skip_special_tokens: bool = True,
+        custom_params: Optional[Dict[str, Any]] = None,
    ) -> None:
        self.temperature = temperature
        self.top_p = top_p
@@ -71,6 +72,7 @@ class SamplingParams:
        self.json_schema = json_schema
        self.ebnf = ebnf
        self.no_stop_trim = no_stop_trim
+        self.custom_params = custom_params

        # Process some special cases
        if self.temperature < _SAMPLING_EPS: