PyPI - sglang - Versions diffs - 0.4.5__tar.gz → 0.4.5.post2__tar.gz - Mend

sglang 0.4.5__tar.gz → 0.4.5.post2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (618) hide show

{sglang-0.4.5/sglang.egg-info → sglang-0.4.5.post2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.5
+Version: 0.4.5.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -239,20 +239,30 @@ Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
-Requires-Dist: transformers==4.51.0; extra == "runtime-common"
+Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.8; extra == "srt"
+Requires-Dist: sgl-kernel==0.0.9.post2; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
+Requires-Dist: torchvision==0.20.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Requires-Dist: partial_json_parser; extra == "srt"
 Requires-Dist: einops; extra == "srt"
+Provides-Extra: blackwell
+Requires-Dist: sglang[runtime_common]; extra == "blackwell"
+Requires-Dist: sgl-kernel; extra == "blackwell"
+Requires-Dist: torch; extra == "blackwell"
+Requires-Dist: torchvision; extra == "blackwell"
+Requires-Dist: cuda-python; extra == "blackwell"
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
+Requires-Dist: partial_json_parser; extra == "blackwell"
+Requires-Dist: einops; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -371,7 +381,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -391,7 +401,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 ## Adoption and Sponsorship
 The project has been deployed to large-scale production, generating trillions of tokens every day.
-It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>

{sglang-0.4.5 → sglang-0.4.5.post2}/README.md RENAMED Viewed

@@ -43,7 +43,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -63,7 +63,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 ## Adoption and Sponsorship
 The project has been deployed to large-scale production, generating trillions of tokens every day.
-It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>

{sglang-0.4.5 → sglang-0.4.5.post2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sglang"
-version = "0.4.5"
+version = "0.4.5.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -38,7 +38,7 @@ runtime_common = [
     "pyzmq>=25.1.2",
     "soundfile==0.13.1",
     "torchao>=0.7.0",
-    "transformers==4.51.0",
+    "transformers==4.51.1",
     "uvicorn",
     "uvloop",
     "compressed-tensors",
@@ -47,9 +47,21 @@ runtime_common = [
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.0.8",
+    "sgl-kernel==0.0.9.post2",
     "flashinfer_python==0.2.3",
     "torch==2.5.1",
+    "torchvision==0.20.1",
+    "cuda-python",
+    "outlines>=0.0.44,<=0.1.11",
+    "partial_json_parser",
+    "einops",
+]
+blackwell = [
+    "sglang[runtime_common]",
+    "sgl-kernel",
+    "torch",
+    "torchvision",
     "cuda-python",
     "outlines>=0.0.44,<=0.1.11",
     "partial_json_parser",

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/__init__.py RENAMED Viewed

@@ -24,6 +24,7 @@ from sglang.api import (
     user_end,
     video,
 )
+from sglang.global_config import global_config
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.lang.choices import (
     greedy_token_selection,
@@ -31,6 +32,7 @@ from sglang.lang.choices import (
     unconditional_likelihood_normalized,
 )
 from sglang.utils import LazyImport
+from sglang.version import __version__
 ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
 Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
@@ -38,10 +40,6 @@ LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
 OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
 VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
-# Other configs
-from sglang.global_config import global_config
-from sglang.version import __version__
 __all__ = [
     "Engine",
     "Runtime",

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/bench_one_batch.py RENAMED Viewed

@@ -60,6 +60,7 @@ from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.managers.scheduler import Scheduler
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -135,6 +136,7 @@ def load_model(server_args, port_args, tp_rank):
         context_length=server_args.context_length,
         model_override_args=server_args.json_model_override_args,
         is_embedding=server_args.is_embedding,
+        enable_multimodal=server_args.enable_multimodal,
         dtype=server_args.dtype,
         quantization=server_args.quantization,
     )
@@ -184,6 +186,7 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
         req.prefix_indices = []
         req.fill_ids = req.origin_input_ids
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+        req.logprob_start_len = len(req.origin_input_ids) - 1
         reqs.append(req)
     return input_ids, reqs
@@ -199,11 +202,12 @@ def prepare_extend_inputs_for_correctness_test(
             i, : bench_args.cut_len
         ]
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+        req.logprob_start_len = len(req.origin_input_ids) - 1
     return reqs
 def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
-    input_ids = np.ones((batch_size, input_len), dtype=np.int32)
+    input_ids = np.random.randint(0, 10000, (batch_size, input_len), dtype=np.int32)
     sampling_params = SamplingParams(
         temperature=0,
         max_new_tokens=BenchArgs.output_len,
@@ -220,6 +224,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
         req.prefix_indices = []
         req.fill_ids = req.origin_input_ids
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+        req.logprob_start_len = len(req.origin_input_ids) - 1
         reqs.append(req)
     return reqs
@@ -238,6 +243,7 @@ def extend(reqs, model_runner):
         enable_custom_logit_processor=False,
     )
     batch.prepare_for_extend()
+    _maybe_prepare_dp_attn_batch(batch, model_runner)
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
@@ -249,6 +255,7 @@ def extend(reqs, model_runner):
 def decode(input_token_ids, batch, model_runner):
     batch.output_ids = input_token_ids
     batch.prepare_for_decode()
+    _maybe_prepare_dp_attn_batch(batch, model_runner)
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
@@ -256,6 +263,20 @@ def decode(input_token_ids, batch, model_runner):
     return next_token_ids, logits_output.next_token_logits
+def _maybe_prepare_dp_attn_batch(batch: ScheduleBatch, model_runner):
+    if model_runner.server_args.enable_dp_attention:
+        Scheduler.prepare_dp_attn_batch_raw(
+            batch,
+            dp_size=model_runner.server_args.dp_size,
+            attn_tp_size=1,
+            tp_cpu_group=model_runner.tp_group.cpu_group,
+            get_idle_batch=None,
+            disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
+            spec_algorithm=SpeculativeAlgorithm.NONE,
+            speculative_num_draft_tokens=None,
+        )
 def correctness_test(
     server_args,
     port_args,
@@ -375,7 +396,7 @@ def latency_test_run_once(
         decode_latencies.append(latency)
         if i < 5:
             rank_print(
-                f"Decode.  latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+                f"Decode. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
             )
     if profile:

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/bench_serving.py RENAMED Viewed

@@ -490,7 +490,7 @@ def get_dataset(args, tokenizer):
             prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
-    elif args.dataset_name == "random":
+    elif args.dataset_name.startswith("random"):
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
@@ -498,6 +498,7 @@ def get_dataset(args, tokenizer):
             range_ratio=args.random_range_ratio,
             tokenizer=tokenizer,
             dataset_path=args.dataset_path,
+            random_sample=args.dataset_name == "random",
         )
     elif args.dataset_name == "generated-shared-prefix":
         input_requests = sample_generated_shared_prefix_requests(
@@ -687,6 +688,7 @@ def sample_random_requests(
     range_ratio: float,
     tokenizer: PreTrainedTokenizerBase,
     dataset_path: str,
+    random_sample: bool = True,
 ) -> List[Tuple[str, int, int]]:
     input_lens = np.random.randint(
@@ -700,7 +702,7 @@ def sample_random_requests(
         size=num_prompts,
     )
-    if True:
+    if random_sample:
         # Sample token ids from ShareGPT and repeat/truncate them to satisfy the input_lens
         # Download sharegpt if necessary
@@ -1223,7 +1225,7 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name == "random":
+        if args.dataset_name.startswith("random"):
             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
         else:
             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
@@ -1442,7 +1444,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "random", "generated-shared-prefix"],
+        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/backend/anthropic.py RENAMED Viewed

@@ -1,7 +1,3 @@
-from typing import List, Optional, Union
-import numpy as np
 from sglang.lang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.interpreter import StreamExecutor

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/backend/base_backend.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from typing import Callable, List, Optional, Union
+from typing import List, Optional, Union
 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/backend/openai.py RENAMED Viewed

@@ -2,7 +2,7 @@ import dataclasses
 import logging
 import time
 import warnings
-from typing import Callable, List, Optional, Union
+from typing import List, Optional, Union
 import numpy as np

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/backend/vertexai.py RENAMED Viewed

@@ -1,6 +1,5 @@
 import os
 import warnings
-from typing import Optional
 from sglang.lang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/compiler.py RENAMED Viewed

@@ -5,13 +5,7 @@ from typing import List, Union
 from sglang.global_config import global_config
 from sglang.lang.interpreter import ProgramState, StreamExecutor, cache_program
-from sglang.lang.ir import (
-    SglArgument,
-    SglConstantText,
-    SglExpr,
-    SglSamplingParams,
-    SglVariable,
-)
+from sglang.lang.ir import SglArgument, SglExpr, SglSamplingParams, SglVariable
 def compile_func(function, backend):

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/lang/tracer.py RENAMED Viewed

@@ -1,20 +1,16 @@
 """Tracing a program."""
 import uuid
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional
-from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
 from sglang.lang.interpreter import ProgramState, ProgramStateGroup
 from sglang.lang.ir import (
     SglArgument,
-    SglCommitLazy,
-    SglConcateAndAppend,
     SglConstantText,
     SglExpr,
     SglExprList,
     SglFork,
-    SglFunction,
     SglGen,
     SglGetForkItem,
     SglRoleBegin,
@@ -230,8 +226,8 @@ class TracerProgramState(ProgramState):
         self.cur_role = None
     def _execute_var_scope_end(self, expr: SglVarScopeEnd):
-        new_node = SglVariable(name, source=self.last_node)
-        self.variables[name] = new_node
+        new_node = SglVariable(expr.name, source=self.last_node)
+        self.variables[expr.name] = new_node
     def get_var(self, name):
         ret = self.arguments.get(name, None)

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/_custom_ops.py RENAMED Viewed

@@ -1,10 +1,8 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/_custom_ops.py
 import logging
-import os
 from typing import List, Tuple
 import torch
-import torch.library
 from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/configs/model_config.py RENAMED Viewed

@@ -15,6 +15,7 @@
 import json
 import logging
 import math
+import os
 from enum import IntEnum, auto
 from typing import List, Optional, Set, Union
@@ -42,10 +43,12 @@ class ModelConfig:
         context_length: Optional[int] = None,
         model_override_args: Optional[str] = None,
         is_embedding: Optional[bool] = None,
+        enable_multimodal: Optional[bool] = None,
         dtype: str = "auto",
         quantization: Optional[str] = None,
         override_config_file: Optional[str] = None,
     ) -> None:
         self.model_path = model_path
         self.revision = revision
         self.quantization = quantization
@@ -69,14 +72,28 @@ class ModelConfig:
             self.hf_text_config, "attention_chunk_size", None
         )
+        if enable_multimodal is None:
+            if self.hf_config.architectures == "Llama4ForConditionalGeneration":
+                enable_multimodal = False
+            else:
+                enable_multimodal = True
         # Check model type
         self.is_generation = is_generation_model(
             self.hf_config.architectures, is_embedding
         )
-        self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
-        self.is_multimodal_gen = is_multimodal_gen_model(self.hf_config.architectures)
-        self.is_image_gen = is_image_gen_model(self.hf_config.architectures)
-        self.is_audio_model = is_audio_model(self.hf_config.architectures)
+        self.is_multimodal = enable_multimodal and is_multimodal_model(
+            self.hf_config.architectures
+        )
+        self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
+            self.hf_config.architectures
+        )
+        self.is_image_gen = enable_multimodal and is_image_gen_model(
+            self.hf_config.architectures
+        )
+        self.is_audio_model = enable_multimodal and is_audio_model(
+            self.hf_config.architectures
+        )
         self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
@@ -234,6 +251,20 @@ class ModelConfig:
         if quant_cfg is None:
             # compressed-tensors uses a "compression_config" key
             quant_cfg = getattr(self.hf_config, "compression_config", None)
+        if quant_cfg is None:
+            # check if is modelopt model -- modelopt doesn't have corresponding field
+            # in hf `config.json` but has a standalone `hf_quant_config.json` in the root directory
+            # example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
+            is_local = os.path.exists(self.model_path)
+            modelopt_quant_config = {"quant_method": "modelopt"}
+            if not is_local:
+                from huggingface_hub import HfApi
+                hf_api = HfApi()
+                if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
+                    quant_cfg = modelopt_quant_config
+            elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
+                quant_cfg = modelopt_quant_config
         return quant_cfg
     # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
@@ -264,6 +295,7 @@ class ModelConfig:
             "moe_wna16",
         ]
         compatible_quantization_methods = {
+            "modelopt_fp4": ["modelopt"],
             "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
             "w8a8_fp8": ["compressed-tensors", "compressed_tensors"],
         }
@@ -470,8 +502,8 @@ multimodal_model_archs = [
     "Gemma3ForConditionalGeneration",
     "Grok1VForCausalLM",
     "Grok1AForCausalLM",
-    # TODO: add multimodal support for "Llama4ForConditionalGeneration",
     "LlavaLlamaForCausalLM",
+    "Llama4ForConditionalGeneration",
     "LlavaMistralForCausalLM",
     "LlavaQwenForCausalLM",
     "LlavaVidForCausalLM",

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/constrained/base_grammar_backend.py RENAMED Viewed

@@ -28,6 +28,18 @@ logger = logging.getLogger(__name__)
 class BaseGrammarObject(ABC):
+    def __init__(self):
+        self._finished = False
+    @property
+    def finished(self):
+        return self._finished
+    @finished.setter
+    def finished(self, finished):
+        self._finished = finished
     @abstractmethod
     def try_jump_forward(self, tokenizer) -> Optional[Tuple[List[int], str]]:
         """
@@ -59,6 +71,13 @@ class BaseGrammarObject(ABC):
         """
         raise NotImplementedError
+    @abstractmethod
+    def accept_token(self, token: int) -> None:
+        """
+        Accept a token in the grammar.
+        """
+        raise NotImplementedError
     @abstractmethod
     def allocate_vocab_mask(
         self, vocab_size: int, batch_size: int, device
@@ -90,7 +109,7 @@ class CacheEntry:
     event: Event
-class BaseGrammarBackend(ABC):
+class BaseGrammarBackend:
     def __init__(self):
         self.executor = ThreadPoolExecutor()
         self.cache: Dict[Tuple[str, str], CacheEntry] = {}
@@ -107,19 +126,15 @@ class BaseGrammarBackend(ABC):
         """
         raise ValueError(f"Invalid key_type: {key_type}={key_string}")
-    @abstractmethod
     def dispatch_json(self, key_string: str) -> Optional[BaseGrammarObject]:
         return self._not_supported("json", key_string)
-    @abstractmethod
     def dispatch_regex(self, key_string: str) -> Optional[BaseGrammarObject]:
         return self._not_supported("regex", key_string)
-    @abstractmethod
     def dispatch_ebnf(self, key_string: str) -> Optional[BaseGrammarObject]:
         return self._not_supported("ebnf", key_string)
-    @abstractmethod
     def dispatch_structural_tag(self, key_string: str) -> Optional[BaseGrammarObject]:
         return self._not_supported("structural_tag", key_string)
@@ -195,4 +210,10 @@ def create_grammar_backend(
     else:
         raise ValueError(f"Invalid grammar backend: {server_args.grammar_backend}")
+    if server_args.reasoning_parser and hasattr(tokenizer, "think_end_id"):
+        from .reasoner_grammar_backend import ReasonerGrammarBackend
+        grammar_backend = ReasonerGrammarBackend(
+            grammar_backend, tokenizer.think_end_id
+        )
     return grammar_backend

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/constrained/llguidance_backend.py RENAMED Viewed

@@ -33,6 +33,7 @@ class GuidanceGrammar(BaseGrammarObject):
     def __init__(
         self, llguidance_tokenizer: llguidance.LLTokenizer, serialized_grammar: str
     ):
+        super().__init__()
         self.llguidance_tokenizer = llguidance_tokenizer
         self.serialized_grammar = serialized_grammar

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/constrained/outlines_backend.py RENAMED Viewed

@@ -44,6 +44,7 @@ class OutlinesGrammar(BaseGrammarObject):
         guide: RegexGuide,
         jump_forward_map: Union[OutlinesJumpForwardMap, None],
     ) -> None:
+        super().__init__()
         self.guide = guide
         self.jump_forward_map = jump_forward_map
         self.state = 0

{sglang-0.4.5 → sglang-0.4.5.post2}/sglang/srt/constrained/outlines_jump_forward.py RENAMED Viewed

@@ -19,10 +19,13 @@ Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/
 import dataclasses
 import logging
 from collections import defaultdict
+from typing import Optional
 import interegular
 from interegular import InvalidSyntax
-from outlines.caching import cache as disk_cache
+from outlines.caching import cache
+from sglang.srt.utils import get_bool_env_var
 try:
     # outlines >= 0.1.0
@@ -34,6 +37,9 @@ except ImportError:
 IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
+# Env var was set in sglang.srt.server_args.ServerArgs.__post__init__
+DISABLE_DISK_CACHE = get_bool_env_var("SGLANG_DISABLE_OUTLINES_DISK_CACHE", "true")
 logger = logging.getLogger(__name__)
@@ -45,6 +51,13 @@ class JumpEdge:
     byte_next_state: int = None
+def disk_cache(expire: Optional[float] = None, typed=False, ignore=()):
+    if not DISABLE_DISK_CACHE:
+        return cache(expire, typed, ignore)
+    else:
+        return lambda fn: None
 @disk_cache()
 def init_state_to_jump_forward(regex_string):
     try:

sglang 0.4.5__tar.gz → 0.4.5.post2__tar.gz

sglang 0.4.5__tar.gz → 0.4.5.post2__tar.gz