sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl

This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (205)
  1. sglang/api.py +1 -1
  2. sglang/bench_offline_throughput.py +19 -0
  3. sglang/bench_one_batch.py +2 -2
  4. sglang/bench_serving.py +123 -79
  5. sglang/global_config.py +8 -3
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/lang/ir.py +1 -1
  8. sglang/srt/_custom_ops.py +83 -91
  9. sglang/srt/configs/load_config.py +4 -1
  10. sglang/srt/configs/model_config.py +48 -2
  11. sglang/srt/configs/qwen2_5_vl_config.py +5 -2
  12. sglang/srt/constrained/base_grammar_backend.py +117 -15
  13. sglang/srt/constrained/llguidance_backend.py +151 -0
  14. sglang/srt/constrained/outlines_backend.py +24 -33
  15. sglang/srt/constrained/xgrammar_backend.py +69 -38
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
  17. sglang/srt/distributed/parallel_state.py +48 -3
  18. sglang/srt/entrypoints/engine.py +67 -9
  19. sglang/srt/entrypoints/http_server.py +190 -41
  20. sglang/srt/entrypoints/verl_engine.py +147 -0
  21. sglang/srt/function_call_parser.py +0 -1
  22. sglang/srt/layers/activation.py +11 -0
  23. sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
  24. sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +220 -378
  26. sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
  27. sglang/srt/layers/attention/torch_native_backend.py +1 -1
  28. sglang/srt/layers/attention/triton_backend.py +9 -6
  29. sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
  30. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
  31. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
  32. sglang/srt/layers/attention/utils.py +39 -0
  33. sglang/srt/layers/attention/vision.py +60 -63
  34. sglang/srt/layers/dp_attention.py +142 -1
  35. sglang/srt/layers/layernorm.py +1 -1
  36. sglang/srt/layers/linear.py +3 -1
  37. sglang/srt/layers/logits_processor.py +281 -45
  38. sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
  39. sglang/srt/layers/moe/ep_moe/layer.py +140 -28
  40. sglang/srt/layers/moe/fused_moe_native.py +2 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
  63. sglang/srt/layers/moe/topk.py +13 -4
  64. sglang/srt/layers/quantization/__init__.py +111 -7
  65. sglang/srt/layers/quantization/blockwise_int8.py +409 -0
  66. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  69. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  71. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  72. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  73. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  76. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  77. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  78. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  80. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  81. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  82. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  84. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  85. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  86. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  88. sglang/srt/layers/quantization/fp8.py +69 -28
  89. sglang/srt/layers/quantization/fp8_utils.py +17 -1
  90. sglang/srt/layers/quantization/gptq.py +416 -0
  91. sglang/srt/layers/quantization/int8_kernel.py +327 -0
  92. sglang/srt/layers/quantization/int8_utils.py +73 -0
  93. sglang/srt/layers/quantization/modelopt_quant.py +18 -1
  94. sglang/srt/layers/radix_attention.py +1 -0
  95. sglang/srt/layers/rotary_embedding.py +0 -1
  96. sglang/srt/layers/sampler.py +76 -31
  97. sglang/srt/layers/vocab_parallel_embedding.py +14 -13
  98. sglang/srt/lora/lora.py +17 -1
  99. sglang/srt/lora/lora_config.py +5 -0
  100. sglang/srt/lora/lora_manager.py +1 -3
  101. sglang/srt/managers/cache_controller.py +193 -62
  102. sglang/srt/managers/configure_logging.py +2 -1
  103. sglang/srt/managers/data_parallel_controller.py +6 -2
  104. sglang/srt/managers/detokenizer_manager.py +124 -102
  105. sglang/srt/managers/image_processor.py +2 -1
  106. sglang/srt/managers/io_struct.py +143 -6
  107. sglang/srt/managers/schedule_batch.py +237 -197
  108. sglang/srt/managers/schedule_policy.py +29 -29
  109. sglang/srt/managers/scheduler.py +681 -259
  110. sglang/srt/managers/session_controller.py +6 -2
  111. sglang/srt/managers/tokenizer_manager.py +224 -68
  112. sglang/srt/managers/tp_worker.py +15 -4
  113. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  114. sglang/srt/mem_cache/chunk_cache.py +18 -11
  115. sglang/srt/mem_cache/hiradix_cache.py +394 -0
  116. sglang/srt/mem_cache/memory_pool.py +44 -18
  117. sglang/srt/mem_cache/radix_cache.py +58 -47
  118. sglang/srt/metrics/collector.py +94 -36
  119. sglang/srt/model_executor/cuda_graph_runner.py +55 -24
  120. sglang/srt/model_executor/forward_batch_info.py +49 -16
  121. sglang/srt/model_executor/model_runner.py +208 -28
  122. sglang/srt/model_loader/loader.py +3 -3
  123. sglang/srt/model_loader/weight_utils.py +36 -14
  124. sglang/srt/models/baichuan.py +31 -6
  125. sglang/srt/models/chatglm.py +39 -7
  126. sglang/srt/models/commandr.py +29 -5
  127. sglang/srt/models/dbrx.py +31 -5
  128. sglang/srt/models/deepseek.py +43 -6
  129. sglang/srt/models/deepseek_nextn.py +32 -19
  130. sglang/srt/models/deepseek_v2.py +265 -32
  131. sglang/srt/models/exaone.py +19 -9
  132. sglang/srt/models/gemma.py +22 -8
  133. sglang/srt/models/gemma2.py +25 -12
  134. sglang/srt/models/gemma2_reward.py +5 -1
  135. sglang/srt/models/gpt2.py +28 -13
  136. sglang/srt/models/gpt_bigcode.py +27 -5
  137. sglang/srt/models/granite.py +21 -9
  138. sglang/srt/models/grok.py +21 -4
  139. sglang/srt/models/internlm2.py +36 -6
  140. sglang/srt/models/internlm2_reward.py +5 -1
  141. sglang/srt/models/llama.py +26 -9
  142. sglang/srt/models/llama_classification.py +5 -1
  143. sglang/srt/models/llama_eagle.py +17 -4
  144. sglang/srt/models/llama_embedding.py +5 -1
  145. sglang/srt/models/llama_reward.py +7 -2
  146. sglang/srt/models/llava.py +19 -3
  147. sglang/srt/models/llavavid.py +10 -1
  148. sglang/srt/models/minicpm.py +26 -2
  149. sglang/srt/models/minicpm3.py +39 -3
  150. sglang/srt/models/minicpmv.py +45 -14
  151. sglang/srt/models/mixtral.py +20 -9
  152. sglang/srt/models/mixtral_quant.py +50 -8
  153. sglang/srt/models/mllama.py +57 -11
  154. sglang/srt/models/olmo.py +34 -6
  155. sglang/srt/models/olmo2.py +34 -13
  156. sglang/srt/models/olmoe.py +26 -4
  157. sglang/srt/models/phi3_small.py +29 -10
  158. sglang/srt/models/qwen.py +26 -3
  159. sglang/srt/models/qwen2.py +26 -4
  160. sglang/srt/models/qwen2_5_vl.py +46 -8
  161. sglang/srt/models/qwen2_eagle.py +17 -5
  162. sglang/srt/models/qwen2_moe.py +44 -6
  163. sglang/srt/models/qwen2_rm.py +78 -0
  164. sglang/srt/models/qwen2_vl.py +39 -8
  165. sglang/srt/models/stablelm.py +32 -5
  166. sglang/srt/models/torch_native_llama.py +5 -2
  167. sglang/srt/models/xverse.py +21 -9
  168. sglang/srt/models/xverse_moe.py +45 -7
  169. sglang/srt/models/yivl.py +2 -1
  170. sglang/srt/openai_api/adapter.py +109 -24
  171. sglang/srt/openai_api/protocol.py +17 -1
  172. sglang/srt/reasoning_parser.py +154 -0
  173. sglang/srt/sampling/penaltylib/__init__.py +4 -6
  174. sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
  175. sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
  176. sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
  177. sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
  178. sglang/srt/sampling/sampling_batch_info.py +79 -157
  179. sglang/srt/sampling/sampling_params.py +16 -13
  180. sglang/srt/server_args.py +136 -52
  181. sglang/srt/speculative/build_eagle_tree.py +2 -8
  182. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
  183. sglang/srt/speculative/eagle_utils.py +92 -58
  184. sglang/srt/speculative/eagle_worker.py +186 -94
  185. sglang/srt/speculative/spec_info.py +1 -13
  186. sglang/srt/utils.py +43 -17
  187. sglang/srt/warmup.py +47 -0
  188. sglang/test/few_shot_gsm8k.py +4 -1
  189. sglang/test/runners.py +389 -126
  190. sglang/test/send_one.py +88 -0
  191. sglang/test/test_block_fp8_ep.py +361 -0
  192. sglang/test/test_programs.py +1 -1
  193. sglang/test/test_utils.py +138 -84
  194. sglang/utils.py +50 -60
  195. sglang/version.py +1 -1
  196. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
  197. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +200 -166
  198. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
  199. sglang/bench_latency.py +0 -1
  200. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
  201. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
  202. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
  203. sglang/test/srt/sampling/penaltylib/utils.py +0 -344
  204. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
  205. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/srt/_custom_ops.py CHANGED
@@ -1,21 +1,19 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/_custom_ops.py
-import contextlib
-import functools
-import importlib
 import logging
 import os
-from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+from typing import List, Tuple

 import torch
 import torch.library

-from sglang.srt.utils import is_hpu
+from sglang.srt.utils import is_hip, is_hpu

 logger = logging.getLogger(__name__)
 use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)

 if not is_hpu():
-    if use_vllm_custom_allreduce:
+    # ROCm does not use vllm custom allreduce
+    if use_vllm_custom_allreduce and not is_hip():
         try:
             import vllm._C
         except ImportError as e:
@@ -27,37 +25,8 @@ if not is_hpu():
             logger.warning("Failed to import from custom_ar with %r", e)


-def hint_on_error(fn):
-
-    @functools.wraps(fn)
-    def wrapper(*args, **kwargs):
-        try:
-            return fn(*args, **kwargs)
-
-        except NotImplementedError as e:
-            msg = (
-                "Error in calling custom op %s: %s\n"
-                "Not implemented or built, mostly likely because the current current device "
-                "does not support this kernel (less likely TORCH_CUDA_ARCH_LIST was set "
-                "incorrectly while building)"
-            )
-            logger.error(msg, fn.__name__, e)
-            raise NotImplementedError(msg % (fn.__name__, e)) from e
-        except AttributeError as e:
-            msg = (
-                "Error in calling custom op %s: %s\n"
-                "Possibly you have built or installed an obsolete version of vllm.\n"
-                "Please try a clean build and install of vllm,"
-                "or remove old built files such as vllm/*cpython*.so and build/ ."
-            )
-            logger.error(msg, fn.__name__, e)
-            raise e
-
-    return wrapper
-
-
-if use_vllm_custom_allreduce:
-    # custom ar
+if use_vllm_custom_allreduce and not is_hip():
+    # vLLM custom allreduce
     def init_custom_ar(
         ipc_tensors: List[torch.Tensor],
         rank_data: torch.Tensor,
@@ -95,62 +64,85 @@ if use_vllm_custom_allreduce:
         torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)

 else:
-    # custom ar
-    def init_custom_ar(
-        rank_id: int,
-        world_size: int,
-        rank_data_base: torch.Tensor,
-        buffers: List[int],
-        tmp_result_buffers: List[int],
-        barrier_in: List[int],
-        barrier_out: List[int],
-    ) -> int:
-        return sgl_kernel.ops.init_custom_reduce(
-            rank_id,
-            world_size,
-            rank_data_base,
-            buffers,
-            tmp_result_buffers,
-            barrier_in,
-            barrier_out,
-        )
+    if is_hip():
+        # ROCM custom allreduce
+
+        def init_custom_ar(
+            meta: torch.Tensor,
+            rank_data: torch.Tensor,
+            handles: List[str],
+            offsets: List[int],
+            rank: int,
+            full_nvlink: bool,
+        ) -> int:
+            return sgl_kernel.ops.allreduce.init_custom_ar(
+                meta, rank_data, handles, offsets, rank, full_nvlink
+            )

-    def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
-        sgl_kernel.ops.custom_reduce(fa, inp, out)
+        def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+            sgl_kernel.ops.allreduce.all_reduce_reg(fa, inp, out)

-    def dispose(fa: int) -> None:
-        sgl_kernel.ops.custom_dispose(fa)
+        def all_reduce_unreg(
+            fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
+        ) -> None:
+            sgl_kernel.ops.allreduce.all_reduce_unreg(fa, inp, reg_buffer, out)

-    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
-        return sgl_kernel.ops.get_graph_buffer_ipc_meta(fa)
+        def dispose(fa: int) -> None:
+            sgl_kernel.ops.allreduce.dispose(fa)

-    def register_graph_buffers(
-        fa: int, handles: List[List[int]], offsets: List[List[int]]
-    ) -> None:
-        sgl_kernel.ops.register_graph_buffers(fa, handles, offsets)
-
-
-# temporary fix for https://github.com/vllm-project/vllm/issues/5456
-# TODO: remove this in v0.6.0
-names_and_values = globals()
-names_and_values_to_update = {}
-# prepare variables to avoid dict size change during iteration
-k, v, arg = None, None, None
-fn_type = type(lambda x: x)
-for k, v in names_and_values.items():
-    # find functions that are defined in this file and have torch.Tensor
-    # in their annotations. `arg == "torch.Tensor"` is used to handle
-    # the case when users use `import __annotations__` to turn type
-    # hints into strings.
-    if (
-        isinstance(v, fn_type)
-        and v.__code__.co_filename == __file__
-        and any(
-            arg is torch.Tensor or arg == "torch.Tensor"
-            for arg in v.__annotations__.values()
-        )
-    ):
-        names_and_values_to_update[k] = hint_on_error(v)
+        def meta_size() -> int:
+            return sgl_kernel.ops.allreduce.meta_size()
+
+        def register_buffer(
+            fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
+        ) -> None:
+            return sgl_kernel.ops.allreduce.register_buffer(fa, t, handles, offsets)
+
+        def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
+            return sgl_kernel.ops.allreduce.get_graph_buffer_ipc_meta(fa)
+
+        def register_graph_buffers(
+            fa: int, handles: List[str], offsets: List[List[int]]
+        ) -> None:
+            sgl_kernel.ops.allreduce.register_graph_buffers(fa, handles, offsets)
+
+        def allocate_meta_buffer(size: int) -> torch.Tensor:
+            return sgl_kernel.ops.allreduce.allocate_meta_buffer(size)
+
+        def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
+            return sgl_kernel.ops.allreduce.get_meta_buffer_ipc_handle(inp)
+
+    else:
+        # TRTLLM custom allreduce
+        def init_custom_ar(
+            rank_id: int,
+            world_size: int,
+            rank_data_base: torch.Tensor,
+            buffers: List[int],
+            tmp_result_buffers: List[int],
+            barrier_in: List[int],
+            barrier_out: List[int],
+        ) -> int:
+            return sgl_kernel.ops.init_custom_reduce(
+                rank_id,
+                world_size,
+                rank_data_base,
+                buffers,
+                tmp_result_buffers,
+                barrier_in,
+                barrier_out,
+            )
+
+        def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+            sgl_kernel.ops.custom_reduce(fa, inp, out)
+
+        def dispose(fa: int) -> None:
+            sgl_kernel.ops.custom_dispose(fa)
+
+        def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
+            return sgl_kernel.ops.get_graph_buffer_ipc_meta(fa)

-names_and_values.update(names_and_values_to_update)
-del names_and_values_to_update, names_and_values, v, k, fn_type
+        def register_graph_buffers(
+            fa: int, handles: List[List[int]], offsets: List[List[int]]
+        ) -> None:
+            sgl_kernel.ops.register_graph_buffers(fa, handles, offsets)
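
Note: the module above now picks one of three custom all-reduce bindings at import time. The sketch below restates that dispatch in isolation; it is a simplified illustration of the logic in the hunks, not the actual module code, and the backend label is introduced here only for clarity.

# Simplified restatement of the import-time dispatch in sglang/srt/_custom_ops.py
# (the `backend` variable is hypothetical; the real module defines the op wrappers inline).
import os

from sglang.srt.utils import is_hip, is_hpu

use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)

if not is_hpu():
    if use_vllm_custom_allreduce and not is_hip():
        backend = "vllm"    # torch.ops._C_custom_ar.* kernels
    elif is_hip():
        backend = "rocm"    # sgl_kernel.ops.allreduce.* kernels
    else:
        backend = "trtllm"  # sgl_kernel.ops.custom_reduce / custom_dispose / ...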
sglang/srt/configs/load_config.py CHANGED
@@ -21,6 +21,7 @@ class LoadFormat(str, enum.Enum):
     BITSANDBYTES = "bitsandbytes"
     MISTRAL = "mistral"
     LAYERED = "layered"
+    JAX = "jax"


 @dataclass
@@ -42,13 +43,15 @@ class LoadConfig:
         ignore_patterns: The list of patterns to ignore when loading the model.
            Default to "original/**/*" to avoid repeated loading of llama's
            checkpoints.
-
+        decryption_key_file: If set, decrypts the output files with a password read
+            from this file (after PBKDF2).
     """

     load_format: Union[str, LoadFormat] = LoadFormat.AUTO
     download_dir: Optional[str] = None
     model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
     ignore_patterns: Optional[Union[List[str], str]] = None
+    decryption_key_file: Optional[str] = None

     def __post_init__(self):
         model_loader_extra_config = self.model_loader_extra_config or {}
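
A small construction sketch for the two additions above. The key-file path is hypothetical, and whether a given model loader actually honors LoadFormat.JAX depends on the rest of this release:

from sglang.srt.configs.load_config import LoadConfig, LoadFormat

# LoadFormat.JAX and decryption_key_file are the fields introduced in this hunk.
load_config = LoadConfig(
    load_format=LoadFormat.JAX,
    decryption_key_file="/path/to/key_file",  # hypothetical path, for illustration only
)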
sglang/srt/configs/model_config.py CHANGED
@@ -14,6 +14,7 @@

 import json
 import logging
+import math
 from enum import IntEnum, auto
 from typing import List, Optional, Set, Union

@@ -39,10 +40,11 @@ class ModelConfig:
         trust_remote_code: bool = True,
         revision: Optional[str] = None,
         context_length: Optional[int] = None,
-        model_override_args: Optional[dict] = None,
+        model_override_args: Optional[str] = None,
         is_embedding: Optional[bool] = None,
         dtype: str = "auto",
         quantization: Optional[str] = None,
+        override_config_file: Optional[str] = None,
     ) -> None:
         self.model_path = model_path
         self.revision = revision
@@ -50,11 +52,16 @@ class ModelConfig:

         # Parse args
         self.model_override_args = json.loads(model_override_args)
+        kwargs = {}
+        if override_config_file and override_config_file.strip():
+            kwargs["_configuration_file"] = override_config_file.strip()
+
         self.hf_config = get_config(
             model_path,
             trust_remote_code=trust_remote_code,
             revision=revision,
             model_override_args=self.model_override_args,
+            **kwargs,
         )
         self.hf_text_config = get_hf_text_config(self.hf_config)

@@ -63,6 +70,9 @@ class ModelConfig:
             self.hf_config.architectures, is_embedding
         )
         self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
+        self.is_multimodal_gen = is_multimodal_gen_model(self.hf_config.architectures)
+        self.is_image_gen = is_image_gen_model(self.hf_config.architectures)
+        self.is_audio_model = is_audio_model(self.hf_config.architectures)
         self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)

@@ -70,7 +80,9 @@ class ModelConfig:
         derived_context_len = get_context_length(self.hf_text_config)
         if context_length is not None:
             if context_length > derived_context_len:
-                if get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"):
+                if get_bool_env_var(
+                    "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", default="False"
+                ):
                     logger.warning(
                         f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
                         f"This may lead to incorrect model outputs or CUDA errors."
@@ -103,7 +115,20 @@ class ModelConfig:
             self.head_dim = 256
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_config.kv_lora_rank
+            self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
             self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
+            self.v_head_dim = self.hf_config.v_head_dim
+
+            # Handle rope scaling with yarn
+            self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
+            if self.hf_config.rope_scaling:
+                mscale_all_dim = self.hf_config.rope_scaling.get(
+                    "mscale_all_dim", False
+                )
+                scaling_factor = self.hf_config.rope_scaling["factor"]
+                mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+                self.scaling = self.scaling * mscale * mscale
+
         elif "MiniCPM3ForCausalLM" in self.hf_config.architectures:
             self.head_dim = 128
             self.attention_arch = AttentionArch.MLA
@@ -389,6 +414,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         or "LlamaForSequenceClassification" in model_architectures
         or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
         or "InternLM2ForRewardModel" in model_architectures
+        or "Qwen2ForRewardModel" in model_architectures
     ):
         return False
     else:
@@ -401,6 +427,8 @@ def is_multimodal_model(model_architectures: List[str]):
         or "LlavaQwenForCausalLM" in model_architectures
         or "LlavaMistralForCausalLM" in model_architectures
         or "LlavaVidForCausalLM" in model_architectures
+        or "Grok1VForCausalLM" in model_architectures
+        or "Grok1AForCausalLM" in model_architectures
         or "MllamaForConditionalGeneration" in model_architectures
         or "Qwen2VLForConditionalGeneration" in model_architectures
         or "Qwen2_5_VLForConditionalGeneration" in model_architectures
@@ -411,5 +439,23 @@ def is_multimodal_model(model_architectures: List[str]):
         return False


+def is_multimodal_gen_model(model_architectures: List[str]):
+    return False
+
+
+def is_image_gen_model(model_architectures: List[str]):
+    return False
+
+
+def is_audio_model(model_architectures: List[str]):
+    return False
+
+
 def is_encoder_decoder_model(model_architectures: List[str]):
     return "MllamaForConditionalGeneration" in model_architectures
+
+
+def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
+    if scale <= 1:
+        return 1.0
+    return 0.1 * mscale * math.log(scale) + 1.0
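
To make the "Handle rope scaling with yarn" hunk concrete, here is the same arithmetic applied to example values. The head dims and rope-scaling factor below are illustrative (roughly DeepSeek-V2-like), not read from any config in this diff:

import math

def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
    # Same helper as added in the hunk above.
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

# Example values (illustrative only)
qk_nope_head_dim, qk_rope_head_dim = 128, 64
rope_scaling = {"factor": 40, "mscale_all_dim": 1.0}

scaling = 1 / math.sqrt(qk_nope_head_dim + qk_rope_head_dim)  # 1/sqrt(192) ~= 0.0722
mscale = yarn_get_mscale(rope_scaling["factor"], float(rope_scaling["mscale_all_dim"]))
scaling = scaling * mscale * mscale  # final attention softmax scale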
sglang/srt/configs/qwen2_5_vl_config.py CHANGED
@@ -48,13 +48,16 @@ from transformers.image_utils import (
     validate_preprocess_arguments,
 )
 from transformers.modeling_rope_utils import rope_config_validation
-from transformers.models.mllama.image_processing_mllama import is_valid_list_of_images
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 from transformers.processing_utils import ProcessingKwargs, Unpack, VideosKwargs
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD


+def is_valid_list_of_images(images: List):
+    return images and all(is_valid_image(image) for image in images)
+
+
 class Qwen2_5_VLVisionConfig(PretrainedConfig):
     model_type = "qwen2_5_vl"
     base_config_key = "vision_config"
@@ -999,5 +1002,5 @@ class Qwen2_5_VLImageProcessor(BaseImageProcessor):
         return BatchFeature(data=data, tensor_type=return_tensors)


-AutoImageProcessor.register(Qwen2_5_VLConfig, Qwen2_5_VLImageProcessor)
+AutoImageProcessor.register(Qwen2_5_VLConfig, None, Qwen2_5_VLImageProcessor, None)
 AutoProcessor.register(Qwen2_5_VLConfig, Qwen2_5_VLProcessor)
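
For reference, a tiny standalone sketch of the locally defined helper above (the image list is made up; is_valid_image is the transformers.image_utils predicate the helper relies on):

from typing import List

from PIL import Image
from transformers.image_utils import is_valid_image

def is_valid_list_of_images(images: List):
    # Same one-liner as the hunk above: non-empty list where every entry is a valid image.
    return images and all(is_valid_image(image) for image in images)

assert is_valid_list_of_images([Image.new("RGB", (8, 8))])
assert not is_valid_list_of_images([])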
sglang/srt/constrained/base_grammar_backend.py CHANGED
@@ -13,31 +13,130 @@
 # ==============================================================================
 """The baseclass of a backend for grammar-guided constrained decoding."""

+import logging
+from abc import ABC, abstractmethod
 from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
 from threading import Event, Lock
-from typing import Any, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
+
+import torch

 from sglang.srt.server_args import ServerArgs

+logger = logging.getLogger(__name__)
+
+
+class BaseGrammarObject(ABC):
+    @abstractmethod
+    def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
+        """
+        Try to jump forward in the grammar.
+
+        Returns:
+            A jump forward helper which may be used in `jump_forward_str_state`.
+            None if the jump forward is not possible.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
+        """
+        Jump forward for the grammar.
+
+        Returns:
+            A tuple of the jump forward string and the next state of the grammar
+            (which can be used in `jump_and_retokenize` if needed).
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def jump_and_retokenize(
+        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
+    ) -> None:
+        """
+        Jump forward occurs, and update the grammar state if needed.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    @abstractmethod
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def copy(self) -> "BaseGrammarObject":
+        raise NotImplementedError
+

 @dataclass
 class CacheEntry:
-    value: Any
+    value: Optional[BaseGrammarObject]
     event: Event


-class BaseGrammarObject:
-    pass
-
-
-class BaseGrammarBackend:
+class BaseGrammarBackend(ABC):
     def __init__(self):
         self.executor = ThreadPoolExecutor()
-        self.cache = {}
+        self.cache: Dict[Tuple[str, str], CacheEntry] = {}
         self.cache_lock = Lock()

-    def init_value(self, key: Tuple[str, str]) -> BaseGrammarObject:
+    def _not_supported(self, key_type: str, key_string: str) -> None:
+        logger.warning(f"Skip unsupported {key_type}: {key_type}={key_string}")
+
+    def dispatch_fallback(
+        self, key_type: str, key_string: str
+    ) -> Optional[BaseGrammarObject]:
+        """
+        This function should not be reached in any case.
+        """
+        raise ValueError(f"Invalid key_type: {key_type}={key_string}")
+
+    @abstractmethod
+    def dispatch_json(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("json", key_string)
+
+    @abstractmethod
+    def dispatch_regex(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("regex", key_string)
+
+    @abstractmethod
+    def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("ebnf", key_string)
+
+    @abstractmethod
+    def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
+        return self._not_supported("structural_tag", key_string)
+
+    def _init_value_dispatch(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
+        key_type, key_string = key
+        if key_type == "json":
+            return self.dispatch_json(key_string)
+        elif key_type == "regex":
+            return self.dispatch_regex(key_string)
+        elif key_type == "ebnf":
+            return self.dispatch_ebnf(key_string)
+        elif key_type == "structural_tag":
+            return self.dispatch_structural_tag(key_string)
+        else:
+            return self.dispatch_fallback(key_type, key_string)
+
+    def _init_value(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
         with self.cache_lock:
             if key in self.cache:
                 cache_hit = True
@@ -50,13 +149,10 @@ class BaseGrammarBackend:
         if cache_hit:
             entry.event.wait()
         else:
-            entry.value = self.init_value_impl(key)
+            entry.value = self._init_value_dispatch(key)
             entry.event.set()
         return entry.value.copy() if entry.value else None

-    def init_value_impl(self, key: Tuple[str, str]) -> BaseGrammarObject:
-        raise NotImplementedError()
-
     def get_cached_value(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
         with self.cache_lock:
             entry = self.cache.get(key)
@@ -66,7 +162,7 @@ class BaseGrammarBackend:
         return val.copy() if val else None

     def get_future_value(self, key: Tuple[str, str]) -> Future:
-        return self.executor.submit(self.init_value, key)
+        return self.executor.submit(self._init_value, key)

     def reset(self):
         with self.cache_lock:
@@ -80,12 +176,18 @@ def create_grammar_backend(server_args: ServerArgs, tokenizer, vocab_size):
         grammar_backend = OutlinesGrammarBackend(
             tokenizer,
             whitespace_pattern=server_args.constrained_json_whitespace_pattern,
-            allow_jump_forward=not server_args.disable_jump_forward,
         )
     elif server_args.grammar_backend == "xgrammar":
         from sglang.srt.constrained.xgrammar_backend import XGrammarGrammarBackend

         grammar_backend = XGrammarGrammarBackend(tokenizer, vocab_size=vocab_size)
+    elif server_args.grammar_backend == "llguidance":
+        from sglang.srt.constrained.llguidance_backend import GuidanceBackend
+
+        grammar_backend = GuidanceBackend(
+            tokenizer=tokenizer,
+            whitespace_pattern=server_args.constrained_json_whitespace_pattern,
+        )
     else:
         raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
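
The practical effect of the refactor above is that a grammar backend now subclasses BaseGrammarBackend and overrides the per-key-type dispatch hooks instead of a single init_value_impl. A minimal sketch of such a subclass follows; MyJsonOnlyBackend and its _compile_json_schema step are hypothetical and stand in for a real compilation path (outlines, xgrammar, or llguidance):

from typing import Optional

from sglang.srt.constrained.base_grammar_backend import (
    BaseGrammarBackend,
    BaseGrammarObject,
)


class MyJsonOnlyBackend(BaseGrammarBackend):
    # Only JSON schemas are compiled; the other key types are skipped with a warning,
    # mirroring the _not_supported helper introduced in this release.
    def dispatch_json(self, key_string: str) -> Optional[BaseGrammarObject]:
        return self._compile_json_schema(key_string)

    def dispatch_regex(self, key_string: str) -> Optional[BaseGrammarObject]:
        return self._not_supported("regex", key_string)

    def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
        return self._not_supported("ebnf", key_string)

    def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
        return self._not_supported("structural_tag", key_string)

    def _compile_json_schema(self, schema: str) -> Optional[BaseGrammarObject]:
        # Backend-specific compilation would go here (hypothetical helper).
        raise NotImplementedError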