sglang 0.4.5.post1__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. sglang/__init__.py +2 -4
  2. sglang/bench_one_batch.py +2 -2
  3. sglang/bench_serving.py +3 -6
  4. sglang/compile_deep_gemm.py +136 -0
  5. sglang/lang/backend/anthropic.py +0 -4
  6. sglang/lang/backend/base_backend.py +1 -1
  7. sglang/lang/backend/openai.py +6 -2
  8. sglang/lang/backend/runtime_endpoint.py +5 -1
  9. sglang/lang/backend/vertexai.py +0 -1
  10. sglang/lang/compiler.py +1 -7
  11. sglang/lang/tracer.py +3 -7
  12. sglang/srt/_custom_ops.py +0 -2
  13. sglang/srt/configs/model_config.py +4 -1
  14. sglang/srt/constrained/outlines_jump_forward.py +14 -1
  15. sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
  16. sglang/srt/constrained/xgrammar_backend.py +27 -4
  17. sglang/srt/custom_op.py +0 -62
  18. sglang/srt/disaggregation/decode.py +105 -6
  19. sglang/srt/disaggregation/mini_lb.py +74 -9
  20. sglang/srt/disaggregation/mooncake/conn.py +33 -63
  21. sglang/srt/disaggregation/mooncake/transfer_engine.py +30 -61
  22. sglang/srt/disaggregation/nixl/__init__.py +1 -0
  23. sglang/srt/disaggregation/nixl/conn.py +622 -0
  24. sglang/srt/disaggregation/prefill.py +137 -17
  25. sglang/srt/disaggregation/utils.py +32 -0
  26. sglang/srt/entrypoints/engine.py +4 -0
  27. sglang/srt/entrypoints/http_server.py +3 -7
  28. sglang/srt/entrypoints/verl_engine.py +7 -5
  29. sglang/srt/function_call_parser.py +60 -0
  30. sglang/srt/layers/activation.py +6 -8
  31. sglang/srt/layers/attention/flashattention_backend.py +883 -209
  32. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  33. sglang/srt/layers/attention/torch_native_backend.py +6 -1
  34. sglang/srt/layers/attention/triton_backend.py +6 -0
  35. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
  36. sglang/srt/layers/attention/triton_ops/extend_attention.py +18 -7
  37. sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
  38. sglang/srt/layers/dp_attention.py +1 -1
  39. sglang/srt/layers/layernorm.py +20 -5
  40. sglang/srt/layers/linear.py +17 -3
  41. sglang/srt/layers/moe/ep_moe/layer.py +17 -29
  42. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  43. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +14 -19
  44. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  45. sglang/srt/layers/moe/topk.py +27 -30
  46. sglang/srt/layers/parameter.py +0 -2
  47. sglang/srt/layers/quantization/__init__.py +1 -0
  48. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  49. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +9 -2
  50. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +16 -44
  51. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  52. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
  53. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
  54. sglang/srt/layers/quantization/deep_gemm.py +378 -0
  55. sglang/srt/layers/quantization/fp8.py +115 -132
  56. sglang/srt/layers/quantization/fp8_kernel.py +213 -88
  57. sglang/srt/layers/quantization/fp8_utils.py +189 -264
  58. sglang/srt/layers/quantization/gptq.py +13 -7
  59. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  60. sglang/srt/layers/quantization/moe_wna16.py +2 -0
  61. sglang/srt/layers/quantization/utils.py +5 -11
  62. sglang/srt/layers/quantization/w8a8_fp8.py +2 -0
  63. sglang/srt/layers/quantization/w8a8_int8.py +7 -7
  64. sglang/srt/layers/radix_attention.py +15 -0
  65. sglang/srt/layers/rotary_embedding.py +9 -8
  66. sglang/srt/layers/sampler.py +7 -12
  67. sglang/srt/lora/backend/base_backend.py +18 -2
  68. sglang/srt/lora/backend/flashinfer_backend.py +1 -1
  69. sglang/srt/lora/backend/triton_backend.py +1 -1
  70. sglang/srt/lora/layers.py +1 -1
  71. sglang/srt/lora/lora.py +1 -1
  72. sglang/srt/lora/lora_manager.py +1 -1
  73. sglang/srt/managers/data_parallel_controller.py +7 -1
  74. sglang/srt/managers/detokenizer_manager.py +0 -1
  75. sglang/srt/managers/io_struct.py +15 -3
  76. sglang/srt/managers/mm_utils.py +4 -3
  77. sglang/srt/managers/multimodal_processor.py +0 -2
  78. sglang/srt/managers/multimodal_processors/base_processor.py +3 -2
  79. sglang/srt/managers/schedule_batch.py +15 -4
  80. sglang/srt/managers/scheduler.py +28 -77
  81. sglang/srt/managers/tokenizer_manager.py +116 -29
  82. sglang/srt/managers/tp_worker.py +1 -0
  83. sglang/srt/mem_cache/hiradix_cache.py +41 -29
  84. sglang/srt/mem_cache/memory_pool.py +38 -15
  85. sglang/srt/model_executor/cuda_graph_runner.py +15 -10
  86. sglang/srt/model_executor/model_runner.py +39 -31
  87. sglang/srt/models/bert.py +398 -0
  88. sglang/srt/models/deepseek.py +1 -1
  89. sglang/srt/models/deepseek_nextn.py +74 -70
  90. sglang/srt/models/deepseek_v2.py +292 -348
  91. sglang/srt/models/llama.py +5 -5
  92. sglang/srt/models/minicpm3.py +31 -203
  93. sglang/srt/models/minicpmo.py +17 -6
  94. sglang/srt/models/qwen2.py +4 -1
  95. sglang/srt/models/qwen2_moe.py +14 -13
  96. sglang/srt/models/qwen3.py +335 -0
  97. sglang/srt/models/qwen3_moe.py +423 -0
  98. sglang/srt/openai_api/adapter.py +71 -4
  99. sglang/srt/openai_api/protocol.py +6 -1
  100. sglang/srt/reasoning_parser.py +0 -1
  101. sglang/srt/sampling/sampling_batch_info.py +2 -3
  102. sglang/srt/server_args.py +86 -72
  103. sglang/srt/speculative/build_eagle_tree.py +2 -2
  104. sglang/srt/speculative/eagle_utils.py +2 -2
  105. sglang/srt/speculative/eagle_worker.py +6 -14
  106. sglang/srt/utils.py +62 -6
  107. sglang/test/runners.py +5 -1
  108. sglang/test/test_block_fp8.py +167 -0
  109. sglang/test/test_custom_ops.py +1 -1
  110. sglang/test/test_utils.py +3 -1
  111. sglang/version.py +1 -1
  112. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +5 -5
  113. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +116 -110
  114. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +1 -1
  115. sglang/lang/__init__.py +0 -0
  116. sglang/srt/lora/backend/__init__.py +0 -25
  117. sglang/srt/server.py +0 -18
  118. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
  119. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
sglang/__init__.py CHANGED
@@ -24,6 +24,7 @@ from sglang.api import (
  user_end,
  video,
  )
+ from sglang.global_config import global_config
  from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.lang.choices import (
  greedy_token_selection,
@@ -31,6 +32,7 @@ from sglang.lang.choices import (
  unconditional_likelihood_normalized,
  )
  from sglang.utils import LazyImport
+ from sglang.version import __version__

  ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
  Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
@@ -38,10 +40,6 @@ LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
  OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
  VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")

- # Other configs
- from sglang.global_config import global_config
- from sglang.version import __version__
-
  __all__ = [
  "Engine",
  "Runtime",
sglang/bench_one_batch.py CHANGED
@@ -207,7 +207,7 @@ def prepare_extend_inputs_for_correctness_test(


  def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
- input_ids = np.ones((batch_size, input_len), dtype=np.int32)
+ input_ids = np.random.randint(0, 10000, (batch_size, input_len), dtype=np.int32)
  sampling_params = SamplingParams(
  temperature=0,
  max_new_tokens=BenchArgs.output_len,
@@ -396,7 +396,7 @@ def latency_test_run_once(
  decode_latencies.append(latency)
  if i < 5:
  rank_print(
- f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+ f"Decode. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
  )

  if profile:
sglang/bench_serving.py CHANGED
@@ -690,7 +690,6 @@ def sample_random_requests(
  dataset_path: str,
  random_sample: bool = True,
  ) -> List[Tuple[str, int, int]]:
-
  input_lens = np.random.randint(
  max(int(input_len * range_ratio), 1),
  input_len + 1,
@@ -707,10 +706,6 @@

  # Download sharegpt if necessary
  if not os.path.isfile(dataset_path):
- print(
- "If you do not want to randomly sample from a dataset,"
- " please use --dataset-name random-ids."
- )
  dataset_path = download_and_cache_file(SHAREGPT_URL)

  # Load the dataset.
@@ -1029,7 +1024,9 @@ async def benchmark(
  warmup_outputs = await asyncio.gather(*warmup_tasks)

  # Check if at least one warmup request succeeded
- if not any(output.success for output in warmup_outputs):
+ if args.warmup_requests > 0 and not any(
+ output.success for output in warmup_outputs
+ ):
  raise ValueError(
  "Warmup failed - Please make sure benchmark arguments "
  f"are correctly specified. Error: {warmup_outputs[0].error}"
sglang/compile_deep_gemm.py ADDED
@@ -0,0 +1,136 @@
+ """
+ Compile DeepGEMM Kernels for a model with specify server arguments
+
+ This script launches a server for capturing DeepGEMM calls and then compiles the kernels.
+ It accepts server arguments (the same as launch_server.py).
+
+ Usage:
+ python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code
+
+ """
+
+ import argparse
+ import dataclasses
+ import multiprocessing
+ import os
+ import time
+
+ import requests
+
+ from sglang.srt.entrypoints.http_server import launch_server
+ from sglang.srt.managers.io_struct import GenerateReqInput
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
+ from sglang.srt.server_args import ServerArgs
+ from sglang.srt.utils import kill_process_tree
+ from sglang.srt.warmup import warmup
+
+ multiprocessing.set_start_method("spawn", force=True)
+
+ # Reduce warning
+ os.environ["SGL_IN_DEEP_GEMM_PRE_COMPILE_STAGE"] = "1"
+
+
+ @dataclasses.dataclass
+ class CompileArgs:
+ timeout: int = 3600
+
+ @staticmethod
+ def add_cli_args(parser: argparse.ArgumentParser):
+ parser.add_argument("--timeout", type=int, default=CompileArgs.timeout)
+
+ @classmethod
+ def from_cli_args(cls, args: argparse.Namespace):
+ # use the default value's type to cast the args into correct types.
+ attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+ return cls(
+ **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+ )
+
+
+ @warmup("compile-deep-gemm")
+ async def warm_up_compile(tokenizer_manager: TokenizerManager):
+ print("\nGenerate warm up request for compiling DeepGEMM...\n")
+ generate_req_input = GenerateReqInput(
+ input_ids=[0, 1, 2, 3],
+ sampling_params={
+ "temperature": 0.0,
+ "max_new_tokens": 8,
+ "ignore_eos": True,
+ },
+ )
+ await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
+
+
+ def launch_server_internal(server_args):
+ try:
+ launch_server(server_args)
+ except Exception as e:
+ raise e
+ finally:
+ kill_process_tree(os.getpid(), include_parent=False)
+
+
+ def launch_server_process_and_send_one_request(
+ server_args: ServerArgs, compile_args: CompileArgs
+ ):
+ proc = multiprocessing.Process(target=launch_server_internal, args=(server_args,))
+ proc.start()
+ base_url = f"http://{server_args.host}:{server_args.port}"
+ timeout = compile_args.timeout
+
+ start_time = time.time()
+ while time.time() - start_time < timeout:
+ try:
+ headers = {
+ "Content-Type": "application/json; charset=utf-8",
+ }
+ response = requests.get(f"{base_url}/v1/models", headers=headers)
+ if response.status_code == 200:
+ return proc
+ except requests.RequestException:
+ pass
+ time.sleep(10)
+ raise TimeoutError(
+ "DeepGEMM Kernels compilation timeout."
+ "\n\nFeel free and please restart the command."
+ )
+
+
+ def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs):
+ # Disbale cuda graph and torch compile to save time
+ server_args.disable_cuda_graph = True
+ server_args.enable_torch_compile = False
+ print(f"Disable CUDA Graph and Torch Compile to save time...")
+
+ # Set watchdog timeout to compile_args.timeout because compilation will take a long time
+ server_args.watchdog_timeout = compile_args.timeout
+ server_args.warmups = "compile-deep-gemm"
+
+
+ def run_compile(server_args: ServerArgs, compile_args: CompileArgs):
+ print(
+ "Begin DeepGEMM Kernels compilation...\n"
+ "It may take a long time and timeout maybe raised "
+ "while the compilation is still in progress.\n"
+ "Just feel free to restart the command "
+ "until the compilation is fully finished.\n"
+ )
+
+ proc = launch_server_process_and_send_one_request(server_args, compile_args)
+
+ kill_process_tree(proc.pid)
+
+ print("\nDeepGEMM Kernels compilation finished successfully.")
+
+
+ if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ ServerArgs.add_cli_args(parser)
+ CompileArgs.add_cli_args(parser)
+ args = parser.parse_args()
+ server_args = ServerArgs.from_cli_args(args)
+ compile_args = CompileArgs.from_cli_args(args)
+
+ refine_server_args(server_args, compile_args)
+
+ run_compile(server_args, compile_args)
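The CompileArgs.from_cli_args helper above uses a small dataclass-plus-argparse idiom: each field's default value supplies the type that is used to cast the parsed command-line value back into the right Python type. A minimal self-contained sketch of the same idiom follows; the DemoArgs class and its single --timeout flag are illustrative stand-ins, not part of sglang.

    import argparse
    import dataclasses


    @dataclasses.dataclass
    class DemoArgs:
        timeout: int = 3600

        @classmethod
        def from_cli_args(cls, args: argparse.Namespace) -> "DemoArgs":
            # Reuse each field's default-value type to cast the parsed CLI value.
            fields = [(f.name, type(f.default)) for f in dataclasses.fields(cls)]
            return cls(**{name: typ(getattr(args, name)) for name, typ in fields})


    parser = argparse.ArgumentParser()
    parser.add_argument("--timeout", type=int, default=DemoArgs.timeout)
    demo = DemoArgs.from_cli_args(parser.parse_args(["--timeout", "120"]))
    assert demo.timeout == 120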
sglang/lang/backend/anthropic.py CHANGED
@@ -1,7 +1,3 @@
- from typing import List, Optional, Union
-
- import numpy as np
-
  from sglang.lang.backend.base_backend import BaseBackend
  from sglang.lang.chat_template import get_chat_template
  from sglang.lang.interpreter import StreamExecutor
sglang/lang/backend/base_backend.py CHANGED
@@ -1,4 +1,4 @@
- from typing import Callable, List, Optional, Union
+ from typing import List, Optional, Union

  from sglang.lang.chat_template import get_chat_template
  from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod
sglang/lang/backend/openai.py CHANGED
@@ -2,7 +2,7 @@ import dataclasses
  import logging
  import time
  import warnings
- from typing import Callable, List, Optional, Union
+ from typing import List, Optional, Union

  import numpy as np

@@ -161,7 +161,11 @@ class OpenAI(BaseBackend):
  prompt = s.text_

  kwargs = sampling_params.to_openai_kwargs()
- if self.model_name.startswith("o1") or self.model_name.startswith("o3"):
+ if (
+ self.model_name.startswith("o1")
+ or self.model_name.startswith("o3")
+ or "o1" in self.model_name
+ ):
  kwargs.pop("max_tokens", None)
  else:
  kwargs.pop("max_completion_tokens", None)
sglang/lang/backend/runtime_endpoint.py CHANGED
@@ -324,7 +324,11 @@ class RuntimeEndpoint(BaseBackend):

  def _assert_success(self, res):
  if res.status_code != 200:
- raise RuntimeError(res.json())
+ try:
+ content = res.json()
+ except json.JSONDecodeError:
+ content = res.text
+ raise RuntimeError(content)


  def compute_normalized_prompt_logprobs(input_logprobs):
sglang/lang/backend/vertexai.py CHANGED
@@ -1,6 +1,5 @@
  import os
  import warnings
- from typing import Optional

  from sglang.lang.backend.base_backend import BaseBackend
  from sglang.lang.chat_template import get_chat_template
sglang/lang/compiler.py CHANGED
@@ -5,13 +5,7 @@ from typing import List, Union

  from sglang.global_config import global_config
  from sglang.lang.interpreter import ProgramState, StreamExecutor, cache_program
- from sglang.lang.ir import (
- SglArgument,
- SglConstantText,
- SglExpr,
- SglSamplingParams,
- SglVariable,
- )
+ from sglang.lang.ir import SglArgument, SglExpr, SglSamplingParams, SglVariable


  def compile_func(function, backend):
sglang/lang/tracer.py CHANGED
@@ -1,20 +1,16 @@
  """Tracing a program."""

  import uuid
- from typing import Any, Callable, Dict, List, Optional, Union
+ from typing import Any, Dict, List, Optional

- from sglang.global_config import global_config
  from sglang.lang.backend.base_backend import BaseBackend
  from sglang.lang.interpreter import ProgramState, ProgramStateGroup
  from sglang.lang.ir import (
  SglArgument,
- SglCommitLazy,
- SglConcateAndAppend,
  SglConstantText,
  SglExpr,
  SglExprList,
  SglFork,
- SglFunction,
  SglGen,
  SglGetForkItem,
  SglRoleBegin,
@@ -230,8 +226,8 @@ class TracerProgramState(ProgramState):
  self.cur_role = None

  def _execute_var_scope_end(self, expr: SglVarScopeEnd):
- new_node = SglVariable(name, source=self.last_node)
- self.variables[name] = new_node
+ new_node = SglVariable(expr.name, source=self.last_node)
+ self.variables[expr.name] = new_node

  def get_var(self, name):
  ret = self.arguments.get(name, None)
sglang/srt/_custom_ops.py CHANGED
@@ -1,10 +1,8 @@
  # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/_custom_ops.py
  import logging
- import os
  from typing import List, Tuple

  import torch
- import torch.library

  from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu

sglang/srt/configs/model_config.py CHANGED
@@ -73,8 +73,11 @@ class ModelConfig:
  )

  if enable_multimodal is None:
- if self.hf_config.architectures == "Llama4ForConditionalGeneration":
+ if self.hf_config.architectures[0] == "Llama4ForConditionalGeneration":
  enable_multimodal = False
+ logger.info(
+ "Multimodal is disabled for Llama4. To enable it, set --enable-llama4-multimodal."
+ )
  else:
  enable_multimodal = True

sglang/srt/constrained/outlines_jump_forward.py CHANGED
@@ -19,10 +19,13 @@ Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/
  import dataclasses
  import logging
  from collections import defaultdict
+ from typing import Optional

  import interegular
  from interegular import InvalidSyntax
- from outlines.caching import cache as disk_cache
+ from outlines.caching import cache
+
+ from sglang.srt.utils import get_bool_env_var

  try:
  # outlines >= 0.1.0
@@ -34,6 +37,9 @@ except ImportError:

  IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"

+ # Env var was set in sglang.srt.server_args.ServerArgs.__post__init__
+ DISABLE_DISK_CACHE = get_bool_env_var("SGLANG_DISABLE_OUTLINES_DISK_CACHE", "true")
+
  logger = logging.getLogger(__name__)


@@ -45,6 +51,13 @@ class JumpEdge:
  byte_next_state: int = None


+ def disk_cache(expire: Optional[float] = None, typed=False, ignore=()):
+ if not DISABLE_DISK_CACHE:
+ return cache(expire, typed, ignore)
+ else:
+ return lambda fn: None
+
+
  @disk_cache()
  def init_state_to_jump_forward(regex_string):
  try:
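The disk_cache wrapper above decides, once at import time, whether outlines' on-disk cache decorator is applied, based on the SGLANG_DISABLE_OUTLINES_DISK_CACHE environment variable. A minimal sketch of this conditional-decorator idiom is shown below; the maybe_cache helper, the DEMO_ENABLE_CACHE flag, and functools.lru_cache as a stand-in for the disk cache are illustrative assumptions, not sglang code.

    import os
    from functools import lru_cache


    def maybe_cache(enabled: bool):
        # Return a real caching decorator when enabled, otherwise an identity decorator.
        if enabled:
            return lru_cache(maxsize=None)
        return lambda fn: fn


    DEMO_ENABLE_CACHE = os.environ.get("DEMO_ENABLE_CACHE", "0") == "1"


    @maybe_cache(DEMO_ENABLE_CACHE)
    def compile_pattern(regex_string: str) -> str:
        # Placeholder for the expensive FSM construction that benefits from caching.
        return regex_string.upper()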
sglang/srt/constrained/triton_ops/bitmask_ops.py ADDED
@@ -0,0 +1,141 @@
+ # Adapt from
+ # https://github.com/mlc-ai/xgrammar/blob/v0.1.17/python/xgrammar/kernels/apply_token_bitmask_inplace_triton.py
+
+ from typing import List, Optional, Union
+
+ import torch
+ import triton
+ import triton.language as tl
+
+ from sglang.srt.utils import get_device_core_count
+
+
+ @triton.jit
+ def apply_token_bitmask_inplace_kernel(
+ logits_ptr,
+ bitmask_ptr,
+ indices_ptr,
+ num_rows,
+ vocab_size,
+ logits_strides,
+ bitmask_strides,
+ NUM_SMS: tl.constexpr,
+ BLOCK_SIZE: tl.constexpr,
+ ):
+ """Apply a bitmask to logits in-place using Triton. The bitmask is a 01 bitwise compressed tensor,
+ where 0 means the token is masked and 1 means the token is not masked. After applying the bitmask,
+ the masked logits will be set to -inf.
+
+ Parameters
+ ----------
+ logits_ptr : tl.tensor
+ Pointer to the logits tensor to apply the bitmask to.
+
+ bitmask_ptr : tl.tensor
+ Pointer to the bitmask tensor to apply.
+
+ indices_ptr : Optional[tl.tensor]
+ Optional pointer to indices tensor specifying which rows to apply the mask to.
+
+ num_rows : int
+ Number of rows to process. If indices_ptr is provided, this is the number of unique indices.
+
+ vocab_size : int
+ Size of the vocabulary dimension. If the logits does not have a vocab padding, this is the
+ same as the logits's second dimension. Otherwise, this is the actual size of the vocabulary.
+
+ logits_strides : int
+ Stride between rows in the logits tensor.
+
+ bitmask_strides : int
+ Stride between rows in the bitmask tensor.
+
+ NUM_SMS : int
+ Number of streaming multiprocessors to use.
+
+ BLOCK_SIZE : int
+ Size of processing blocks.
+ """
+
+ pid = tl.program_id(0)
+ num_blocks = tl.cdiv(vocab_size, BLOCK_SIZE)
+ for work_id in tl.range(pid, num_rows * num_blocks, NUM_SMS):
+ row_id = work_id // num_blocks
+ block_offset = (work_id % num_blocks) * BLOCK_SIZE
+ batch_id = row_id if indices_ptr is None else tl.load(indices_ptr + row_id)
+ offsets = block_offset + tl.arange(0, BLOCK_SIZE)
+ bitmask_offsets = block_offset // 32 + tl.arange(0, BLOCK_SIZE // 32)
+ vocab_mask = offsets < vocab_size
+ packed_bitmask_mask = bitmask_offsets < bitmask_strides
+ packed_bitmask = tl.load(
+ bitmask_ptr + batch_id * bitmask_strides + bitmask_offsets,
+ packed_bitmask_mask,
+ )
+ bitmask = ((packed_bitmask[:, None] >> (tl.arange(0, 32)[None, :])) & 1) == 0
+ bitmask = bitmask.reshape(BLOCK_SIZE)
+
+ tl.store(
+ logits_ptr + batch_id * logits_strides + offsets,
+ -float("inf"),
+ vocab_mask & bitmask,
+ )
+
+
+ def apply_token_bitmask_inplace_triton(
+ logits: torch.Tensor,
+ bitmask: torch.Tensor,
+ indices: Optional[Union[List[int], torch.Tensor]] = None,
+ ):
+ NUM_SMS = get_device_core_count()
+ BLOCK_SIZE = 4096
+ BITS_PER_BLOCK = 32
+
+ # Check input dtype
+ assert bitmask.dtype == torch.int32, "bitmask must be of type int32"
+
+ # Check input tensor shapes.
+ logits_shape = logits.shape
+ bitmask_shape = bitmask.shape
+ if logits.ndim == 1:
+ logits_shape = (1, logits_shape[0])
+ if bitmask.ndim == 1:
+ bitmask_shape = (1, bitmask_shape[0])
+
+ required_bitmask_width = (logits_shape[1] + BITS_PER_BLOCK - 1) // BITS_PER_BLOCK
+ assert required_bitmask_width >= bitmask_shape[1], (
+ f"Bitmask width too large: allow at most {required_bitmask_width} int32s for "
+ f"logits' width {logits_shape[1]}, but got {bitmask_shape[1]}"
+ )
+
+ vocab_size = min(logits_shape[1], bitmask_shape[1] * BITS_PER_BLOCK)
+
+ num_rows = None
+ if isinstance(indices, list) or isinstance(indices, torch.Tensor):
+ indices = torch.tensor(indices, dtype=torch.int32, device=logits.device)
+ num_rows = indices.shape[0]
+ else:
+ assert (
+ logits_shape[0] == bitmask_shape[0]
+ ), f"batch size mismatch: logits {logits_shape[0]} vs bitmask {bitmask_shape[0]}"
+ num_rows = logits_shape[0]
+
+ if NUM_SMS > 0:
+ grid = (NUM_SMS,)
+ else:
+ num_blocks = triton.cdiv(vocab_size, BLOCK_SIZE)
+ grid = (num_rows * num_blocks,)
+ NUM_SMS = triton.next_power_of_2(grid[0])
+
+ apply_token_bitmask_inplace_kernel[grid](
+ logits,
+ bitmask,
+ indices,
+ num_rows,
+ vocab_size,
+ logits_shape[1],
+ bitmask_shape[1],
+ NUM_SMS,
+ BLOCK_SIZE,
+ num_warps=BLOCK_SIZE // 32 // (16 // logits.element_size()),
+ num_stages=3,
+ )
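For reference, the packed-bitmask convention that apply_token_bitmask_inplace_kernel implements is: bit j of int32 word k covers token k * 32 + j, and a 0 bit forces that token's logit to -inf. The helper below reproduces those semantics in plain PyTorch; it is an illustrative sketch only, not part of the package and not performance-equivalent to the Triton kernel.

    import torch


    def apply_token_bitmask_reference(logits: torch.Tensor, bitmask: torch.Tensor) -> None:
        # logits: (batch, vocab) float tensor; bitmask: (batch, ceil(vocab / 32)) int32 tensor.
        vocab_size = logits.shape[-1]
        bit_positions = torch.arange(32, device=logits.device, dtype=torch.int32)
        # Unpack each int32 word into its 32 bits: 1 = keep the logit, 0 = mask it.
        unpacked = (bitmask.unsqueeze(-1) >> bit_positions) & 1  # (batch, words, 32)
        keep = unpacked.reshape(bitmask.shape[0], -1)[:, :vocab_size].bool()
        logits.masked_fill_(~keep, float("-inf"))

Allowing only token 0 in a row, for example, means word 0 of that row holds the value 1 and every other word is 0.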
sglang/srt/constrained/xgrammar_backend.py CHANGED
@@ -25,13 +25,16 @@ from xgrammar import (
  StructuralTagItem,
  TokenizerInfo,
  allocate_token_bitmask,
- apply_token_bitmask_inplace,
  )

  from sglang.srt.constrained.base_grammar_backend import (
  BaseGrammarBackend,
  BaseGrammarObject,
  )
+ from sglang.srt.constrained.triton_ops.bitmask_ops import (
+ apply_token_bitmask_inplace_triton,
+ )
+ from sglang.srt.utils import get_bool_env_var

  logger = logging.getLogger(__name__)

@@ -55,6 +58,18 @@ class XGrammarGrammar(BaseGrammarObject):
  self.override_stop_tokens = override_stop_tokens
  self.finished = False

+ # Fix (from vLLM team): postpone the import of apply_token_bitmask_inplace_kernels to the
+ # class init site to avoid re-initializing CUDA in forked subprocess.
+ from xgrammar.kernels import apply_token_bitmask_inplace_kernels
+
+ self.use_token_bitmask_triton = get_bool_env_var(
+ "SGLANG_TOKEN_BITMASK_TRITON", "false"
+ )
+ self.apply_vocab_mask_cuda = apply_token_bitmask_inplace_kernels.get(
+ "cuda", None
+ )
+ self.apply_vocab_mask_cpu = apply_token_bitmask_inplace_kernels.get("cpu", None)
+
  def accept_token(self, token: int):
  assert self.matcher.accept_token(token)

@@ -97,9 +112,16 @@ class XGrammarGrammar(BaseGrammarObject):
  def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
  return vocab_mask.to(device, non_blocking=True)

- @staticmethod
- def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
- apply_token_bitmask_inplace(logits, vocab_mask)
+ def apply_vocab_mask(self, logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+ if (
+ not self.use_token_bitmask_triton
+ and logits.device.type == "cuda"
+ and self.apply_vocab_mask_cuda
+ ):
+ return self.apply_vocab_mask_cuda(logits, vocab_mask)
+ if logits.device.type == "cpu" and self.apply_vocab_mask_cpu:
+ return self.apply_vocab_mask_cpu(logits, vocab_mask)
+ apply_token_bitmask_inplace_triton(logits, vocab_mask)

  def copy(self):
  matcher = GrammarMatcher(
@@ -136,6 +158,7 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
  def dispatch_json(self, key_string: str) -> Optional[XGrammarGrammar]:
  try:
  if key_string == "$$ANY$$":
+ # Note: This builtin JSON grammar includes *all* valid JSON (including, for example, arrays at the root)
  ctx = self.grammar_compiler.compile_builtin_json_grammar()
  else:
  ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
sglang/srt/custom_op.py CHANGED
@@ -42,65 +42,3 @@ class CustomOp(nn.Module):
  return self.forward_hip
  else:
  return self.forward_native
-
-
- if _is_cuda:
- from sgl_kernel import sgl_per_tensor_quant_fp8, sgl_per_token_quant_fp8
-
- def scaled_fp8_quant(
- input: torch.Tensor,
- scale: Optional[torch.Tensor] = None,
- num_token_padding: Optional[int] = None,
- use_per_token_if_dynamic: bool = False,
- ) -> tuple[torch.Tensor, torch.Tensor]:
- """
- Quantize input tensor to FP8 (8-bit floating point) format.
-
- Args:
- input (torch.Tensor): Input tensor to be quantized
- scale (Optional[torch.Tensor]): Pre-computed scaling factor for static quantization.
- If None, scales will be computed dynamically.
- num_token_padding (Optional[int]): If specified, pad the first dimension
- of the output to at least this value.
- use_per_token_if_dynamic (bool): When using dynamic scaling (scale=None),
- determines the quantization granularity:
- - True: compute scale per token
- - False: compute single scale per tensor
-
- Returns:
- Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
- - quantized_tensor: The FP8 quantized version of input
- - scale_tensor: The scaling factors used for quantization
-
- Raises:
- AssertionError: If input is not 2D or if static scale's numel != 1
- """
- assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
- shape = input.shape
- out_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
- if num_token_padding:
- shape = (max(num_token_padding, input.shape[0]), shape[1])
- output = torch.empty(shape, device=input.device, dtype=out_dtype)
-
- if scale is None:
- # Dynamic scaling
- if use_per_token_if_dynamic:
- scale = torch.empty(
- (shape[0], 1), device=input.device, dtype=torch.float32
- )
- sgl_per_token_quant_fp8(input, output, scale)
- else:
- scale = torch.zeros(1, device=input.device, dtype=torch.float32)
- sgl_per_tensor_quant_fp8(
- input, output, scale, is_static=False
- ) # False for dynamic
- else:
- # Static scaling
- assert (
- scale.numel() == 1
- ), f"Expected scalar scale, got numel={scale.numel()}"
- sgl_per_tensor_quant_fp8(
- input, output, scale, is_static=True
- ) # True for static
-
- return output, scale
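The removed scaled_fp8_quant docstring above distinguishes per-tensor from per-token dynamic scaling. A toy illustration of the two granularities, using the common amax-over-FP8-max convention rather than the sgl_kernel ops, is sketched below; it is illustrative only and not the removed implementation.

    import torch

    FP8_E4M3_MAX = 448.0  # max representable magnitude of torch.float8_e4m3fn

    x = torch.randn(4, 8)
    per_tensor_scale = x.abs().amax() / FP8_E4M3_MAX                    # scalar: one scale for the whole tensor
    per_token_scale = x.abs().amax(dim=1, keepdim=True) / FP8_E4M3_MAX  # shape (4, 1): one scale per token (row)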