sglang 0.3.4.post1__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_latency.py +3 -3
- sglang/bench_server_latency.py +2 -3
- sglang/bench_serving.py +92 -0
- sglang/global_config.py +9 -3
- sglang/lang/chat_template.py +50 -25
- sglang/lang/interpreter.py +9 -1
- sglang/lang/ir.py +11 -2
- sglang/launch_server.py +1 -1
- sglang/srt/configs/model_config.py +76 -15
- sglang/srt/constrained/__init__.py +18 -0
- sglang/srt/constrained/bnf_cache.py +61 -0
- sglang/srt/constrained/fsm_cache.py +10 -3
- sglang/srt/constrained/grammar.py +190 -0
- sglang/srt/hf_transformers_utils.py +20 -5
- sglang/srt/layers/attention/flashinfer_backend.py +5 -5
- sglang/srt/layers/attention/triton_ops/decode_attention.py +110 -30
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +1 -1
- sglang/srt/layers/fused_moe/fused_moe.py +4 -3
- sglang/srt/layers/fused_moe/layer.py +28 -0
- sglang/srt/layers/logits_processor.py +5 -5
- sglang/srt/layers/quantization/base_config.py +16 -1
- sglang/srt/layers/rotary_embedding.py +15 -48
- sglang/srt/layers/sampler.py +51 -39
- sglang/srt/layers/vocab_parallel_embedding.py +486 -0
- sglang/srt/managers/data_parallel_controller.py +8 -7
- sglang/srt/managers/detokenizer_manager.py +11 -9
- sglang/srt/managers/image_processor.py +4 -3
- sglang/srt/managers/io_struct.py +80 -78
- sglang/srt/managers/schedule_batch.py +46 -52
- sglang/srt/managers/schedule_policy.py +24 -13
- sglang/srt/managers/scheduler.py +145 -82
- sglang/srt/managers/tokenizer_manager.py +236 -334
- sglang/srt/managers/tp_worker.py +5 -5
- sglang/srt/managers/tp_worker_overlap_thread.py +58 -21
- sglang/srt/mem_cache/flush_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +10 -3
- sglang/srt/model_executor/cuda_graph_runner.py +34 -23
- sglang/srt/model_executor/forward_batch_info.py +6 -9
- sglang/srt/model_executor/model_runner.py +10 -19
- sglang/srt/models/baichuan.py +4 -4
- sglang/srt/models/chatglm.py +4 -4
- sglang/srt/models/commandr.py +1 -1
- sglang/srt/models/dbrx.py +5 -5
- sglang/srt/models/deepseek.py +4 -4
- sglang/srt/models/deepseek_v2.py +4 -4
- sglang/srt/models/exaone.py +4 -4
- sglang/srt/models/gemma.py +1 -1
- sglang/srt/models/gemma2.py +1 -1
- sglang/srt/models/gpt2.py +287 -0
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/grok.py +4 -4
- sglang/srt/models/internlm2.py +4 -4
- sglang/srt/models/llama.py +15 -7
- sglang/srt/models/llama_embedding.py +2 -10
- sglang/srt/models/llama_reward.py +5 -0
- sglang/srt/models/minicpm.py +4 -4
- sglang/srt/models/minicpm3.py +4 -4
- sglang/srt/models/mixtral.py +7 -5
- sglang/srt/models/mixtral_quant.py +4 -4
- sglang/srt/models/mllama.py +5 -5
- sglang/srt/models/olmo.py +4 -4
- sglang/srt/models/olmoe.py +4 -4
- sglang/srt/models/qwen.py +4 -4
- sglang/srt/models/qwen2.py +4 -4
- sglang/srt/models/qwen2_moe.py +4 -4
- sglang/srt/models/qwen2_vl.py +4 -8
- sglang/srt/models/stablelm.py +4 -4
- sglang/srt/models/torch_native_llama.py +4 -4
- sglang/srt/models/xverse.py +4 -4
- sglang/srt/models/xverse_moe.py +4 -4
- sglang/srt/openai_api/adapter.py +52 -66
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
- sglang/srt/sampling/sampling_batch_info.py +7 -13
- sglang/srt/sampling/sampling_params.py +5 -7
- sglang/srt/server.py +41 -33
- sglang/srt/server_args.py +34 -5
- sglang/srt/utils.py +40 -56
- sglang/test/run_eval.py +2 -0
- sglang/test/runners.py +2 -1
- sglang/test/srt/sampling/penaltylib/utils.py +1 -0
- sglang/test/test_utils.py +151 -6
- sglang/utils.py +62 -1
- sglang/version.py +1 -1
- sglang-0.3.5.dist-info/METADATA +344 -0
- sglang-0.3.5.dist-info/RECORD +152 -0
- {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/WHEEL +1 -1
- sglang-0.3.4.post1.dist-info/METADATA +0 -900
- sglang-0.3.4.post1.dist-info/RECORD +0 -148
- {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/top_level.txt +0 -0
sglang/api.py
CHANGED
@@ -99,7 +99,7 @@ def gen(
     regex: Optional[str] = None,
     json_schema: Optional[str] = None,
 ):
-    """Call the model to generate. See the meaning of the arguments in docs/
+    """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""

     if choices:
         return SglSelect(
sglang/bench_latency.py
CHANGED
@@ -129,9 +129,9 @@ def load_model(server_args, port_args, tp_rank):

     model_config = ModelConfig(
         server_args.model_path,
-        server_args.trust_remote_code,
+        trust_remote_code=server_args.trust_remote_code,
         context_length=server_args.context_length,
-        model_override_args=
+        model_override_args=server_args.json_model_override_args,
     )
     model_runner = ModelRunner(
         model_config=model_config,
@@ -550,4 +550,4 @@ if __name__ == "__main__":
     except Exception as e:
         raise e
     finally:
-        kill_child_process(
+        kill_child_process()
sglang/bench_server_latency.py
CHANGED
@@ -15,7 +15,6 @@ import dataclasses
 import itertools
 import json
 import multiprocessing
-import os
 import time
 from typing import Tuple

@@ -70,7 +69,7 @@ def launch_server_internal(server_args):
     except Exception as e:
         raise e
     finally:
-        kill_child_process(
+        kill_child_process()


 def launch_server_process(server_args: ServerArgs):
@@ -176,7 +175,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         )
     finally:
         if proc:
-            kill_child_process(proc.pid)
+            kill_child_process(proc.pid, include_self=True)

     print(f"\nResults are saved to {bench_args.result_filename}")

sglang/bench_serving.py
CHANGED
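The hunks below add a `truss` backend: a new `async_request_truss` request function, a default port of 8080, and the API path `/v1/models/model:predict`. For orientation, here is a minimal sketch of an equivalent one-off (non-streaming) request outside the benchmark; the host, port, and model name are placeholders and the server is assumed to already be running:

```python
import os

import requests

# Same endpoint shape and payload keys as async_request_truss below.
api_url = "http://localhost:8080/v1/models/model:predict"
payload = {
    "model": "my-model",         # placeholder; the benchmark requires --model for this backend
    "prompt": "Hello, my name is",
    "temperature": 0.0,
    "best_of": 1,
    "max_tokens": 32,
    "stream": False,             # the benchmark streams and parses "data: ..." chunks instead
    "ignore_eos": True,
}
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}

resp = requests.post(api_url, json=payload, headers=headers)
print(resp.status_code, resp.text)
```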
@@ -222,6 +222,85 @@ async def async_request_openai_completions(
     return output


+async def async_request_truss(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+
+    prompt = request_func_input.prompt
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        payload = {
+            "model": request_func_input.model,
+            "prompt": prompt,
+            "temperature": 0.0,
+            "best_of": 1,
+            "max_tokens": request_func_input.output_len,
+            "stream": not args.disable_stream,
+            "ignore_eos": not args.disable_ignore_eos,
+            **request_func_input.extra_request_body,
+        }
+        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            pass
+                        else:
+                            data = json.loads(chunk)
+
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if data["choices"][0]["delta"]["content"]:
+                                timestamp = time.perf_counter()
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text += data["choices"][0]["delta"]["content"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                    output.output_len = request_func_input.output_len
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 async def async_request_sglang_generate(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
@@ -350,6 +429,7 @@ ASYNC_REQUEST_FUNCS = {
     "lmdeploy": async_request_openai_completions,
     "trt": async_request_trt_llm,
     "gserver": async_request_gserver,
+    "truss": async_request_truss,
 }


@@ -873,6 +953,7 @@ def run_benchmark(args_: argparse.Namespace):
         "vllm": 8000,
         "trt": 8000,
         "gserver": 9988,
+        "truss": 8080,
     }.get(args.backend, 30000)

     model_url = (
@@ -905,9 +986,20 @@ def run_benchmark(args_: argparse.Namespace):
     elif args.backend == "gserver":
         api_url = args.base_url if args.base_url else f"{args.host}:{args.port}"
         args.model = args.model or "default"
+    elif args.backend == "truss":
+        api_url = (
+            f"{args.base_url}/v1/models/model:predict"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v1/models/model:predict"
+        )

     # Get model name
     if args.model is None:
+        if args.backend == "truss":
+            print(
+                "Please provide a model with `--model` when using truss backend. e.g. --model meta-llama/Llama-3.1-8B-Instruct"
+            )
+            sys.exit(1)
         try:
             response = requests.get(model_url)
             model_list = response.json().get("data", [])
sglang/global_config.py
CHANGED
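The hunk below replaces hard-coded new-token-ratio constants with values read from environment variables. A minimal sketch of overriding them, assuming the variables are set before sglang is imported so the module-level `global_config` singleton picks them up:

```python
import os

# Defaults are 0.7, 0.14, and 600 when the variables are unset.
os.environ["SGLANG_INIT_NEW_TOKEN_RATIO"] = "0.6"
os.environ["SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR"] = "0.2"
os.environ["SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS"] = "400"

from sglang.global_config import global_config

print(global_config.default_init_new_token_ratio)  # 0.6
```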
@@ -14,9 +14,15 @@ class GlobalConfig:
         self.default_backend = None

         # Runtime constants: New generation token ratio estimation
-        self.
-
-
+        self.default_init_new_token_ratio = float(
+            os.environ.get("SGLANG_INIT_NEW_TOKEN_RATIO", 0.7)
+        )
+        self.default_min_new_token_ratio_factor = float(
+            os.environ.get("SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR", 0.14)
+        )
+        self.default_new_token_ratio_decay_steps = float(
+            os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
+        )

         # Runtime constants: others
         self.retract_decode_steps = 20
sglang/lang/chat_template.py
CHANGED
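The hunks below mainly rename and reorder template registrations (`chatml-llava`, `qwen`, `qwen2-vl`, `llama-2-chat`, `llama-3-instruct`, `llama-3-instruct-llava`, `yi-1.5`), fix one-element `stop_str` tuples, and adjust `image_token` values. For reference, a sketch of registering a custom template with the same fields used in these hunks; the template name here is made up:

```python
from sglang.lang.chat_template import (
    ChatTemplate,
    ChatTemplateStyle,
    register_chat_template,
)

register_chat_template(
    ChatTemplate(
        name="my-chatml-variant",  # hypothetical name
        default_system_prompt="You are a helpful assistant.",
        role_prefix_and_suffix={
            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
        },
        style=ChatTemplateStyle.PLAIN,
        stop_str=("<|im_end|>",),
        image_token="<image>\n",
    )
)
```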
@@ -116,12 +116,10 @@ register_chat_template(
     )
 )

-
-# reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
-# The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+
 register_chat_template(
     ChatTemplate(
-        name="
+        name="chatml-llava",
         default_system_prompt="You are a helpful assistant.",
         role_prefix_and_suffix={
             "system": ("<|im_start|>system\n", "<|im_end|>\n"),
@@ -130,13 +128,17 @@ register_chat_template(
         },
         style=ChatTemplateStyle.PLAIN,
         stop_str=("<|im_end|>",),
+        image_token="<image>\n",
     )
 )

-
+
+# There is default system prompt for qwen
+# reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
+# The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
 register_chat_template(
     ChatTemplate(
-        name="
+        name="qwen",
         default_system_prompt="You are a helpful assistant.",
         role_prefix_and_suffix={
             "system": ("<|im_start|>system\n", "<|im_end|>\n"),
@@ -144,15 +146,14 @@ register_chat_template(
             "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
         },
         style=ChatTemplateStyle.PLAIN,
-        stop_str=("<|im_end|>"),
-        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+        stop_str=("<|im_end|>",),
     )
 )

-
+# Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
 register_chat_template(
     ChatTemplate(
-        name="
+        name="qwen2-vl",
         default_system_prompt="You are a helpful assistant.",
         role_prefix_and_suffix={
             "system": ("<|im_start|>system\n", "<|im_end|>\n"),
@@ -161,7 +162,7 @@ register_chat_template(
         },
         style=ChatTemplateStyle.PLAIN,
         stop_str=("<|im_end|>",),
-        image_token="
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
     )
 )

@@ -182,37 +183,46 @@ register_chat_template(
     )
 )

-# Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
 register_chat_template(
     ChatTemplate(
-        name="
+        name="llama-2-chat",
         default_system_prompt=None,
         role_prefix_and_suffix={
-            "system": ("", ""),
-            "user": ("
-            "assistant": ("", "
+            "system": ("<<SYS>>\n", "\n<</SYS>>\n\n"),
+            "user": ("[INST] ", " [/INST]"),
+            "assistant": ("", " </s><s>"),
         },
-        style=ChatTemplateStyle.
-        stop_str=("<|im_end|>",),
+        style=ChatTemplateStyle.LLAMA2,
     )
 )

 register_chat_template(
     ChatTemplate(
-        name="llama-
+        name="llama-3-instruct",
         default_system_prompt=None,
         role_prefix_and_suffix={
-            "system": (
-
-
+            "system": (
+                "<|start_header_id|>system<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "user": (
+                "<|start_header_id|>user<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "assistant": (
+                "<|start_header_id|>assistant<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
         },
-
+        stop_str=("<|eot_id|>",),
+        image_token="<|image|>",
     )
 )

+# The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
-        name="llama-3-instruct",
+        name="llama-3-instruct-llava",
         default_system_prompt=None,
         role_prefix_and_suffix={
             "system": (
@@ -229,7 +239,22 @@ register_chat_template(
             ),
         },
         stop_str=("<|eot_id|>",),
-        image_token="
+        image_token="<image>\n",
+    )
+)
+
+# Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
+register_chat_template(
+    ChatTemplate(
+        name="yi-1.5",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("", ""),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n<|im_start|>assistant\n"),
+            "assistant": ("", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>",),
     )
 )

sglang/lang/interpreter.py
CHANGED
@@ -54,7 +54,14 @@ def run_internal(state, program, func_args, func_kwargs, sync):


 def run_program(
-    program,
+    program,
+    backend,
+    func_args,
+    func_kwargs,
+    default_sampling_para,
+    stream,
+    sync=False,
+    use_thread=True,
 ):
     if hasattr(backend, "endpoint"):
         backend = backend.endpoint
@@ -67,6 +74,7 @@ def run_program(
         chat_template=None,
         stream=stream,
         num_api_spec_tokens=program.num_api_spec_tokens,
+        use_thread=use_thread,
     )
     state = ProgramState(stream_executor)

sglang/lang/ir.py
CHANGED
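Together with the `run_program` change in interpreter.py above, `SglFunction.run` gains a `use_thread` flag (hunks below) that is forwarded to the stream executor. A minimal usage sketch, assuming an SGLang server is already running on localhost:30000:

```python
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def qa(s, question):
    s += "Q: " + question + "\n"
    s += "A: " + sgl.gen("answer", max_tokens=64)

# use_thread=False executes the program in the calling thread
# instead of a background worker thread.
state = qa.run(question="What does SGLang do?", use_thread=False)
print(state["answer"])
```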
@@ -168,6 +168,7 @@ class SglFunction:
         return_text_in_logprobs: Optional[bool] = None,
         stream: bool = False,
         backend=None,
+        use_thread: bool = True,
         **kwargs,
     ):
         from sglang.lang.interpreter import run_program
@@ -195,7 +196,15 @@ class SglFunction:
             return_text_in_logprobs=return_text_in_logprobs,
         )
         backend = backend or global_config.default_backend
-        return run_program(
+        return run_program(
+            self,
+            backend,
+            args,
+            kwargs,
+            default_sampling_para,
+            stream,
+            use_thread=use_thread,
+        )

     def run_batch(
         self,
@@ -445,7 +454,7 @@ class SglGen(SglExpr):
         regex: Optional[str] = None,
         json_schema: Optional[str] = None,
     ):
-        """Call the model to generate. See the meaning of the arguments in docs/
+        """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
         super().__init__()
         self.name = name
         self.sampling_params = SglSamplingParams(
sglang/launch_server.py
CHANGED
sglang/srt/configs/model_config.py
CHANGED
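In the hunks below, `ModelConfig` now parses `model_override_args` from a JSON string, derives `is_generation`/`is_multimodal`/`is_encoder_decoder` from the architecture list, and validates a user-specified context length against the derived one (overridable with `SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1`). A sketch of direct construction, with a placeholder model path:

```python
import json

from sglang.srt.configs.model_config import ModelConfig

config = ModelConfig(
    "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model path
    trust_remote_code=False,
    context_length=None,                 # None -> use the context length derived from the HF config
    model_override_args=json.dumps({}),  # now passed as a JSON string
    is_embedding=False,
)
print(config.context_len, config.is_generation, config.is_multimodal)
```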
@@ -13,13 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+import json
+import logging
+import os
 from enum import IntEnum, auto
-from typing import Optional
+from typing import List, Optional

 from transformers import PretrainedConfig

 from sglang.srt.hf_transformers_utils import get_config, get_context_length

+logger = logging.getLogger(__name__)
+

 class AttentionArch(IntEnum):
     MLA = auto()
@@ -34,22 +39,47 @@ class ModelConfig:
         revision: Optional[str] = None,
         context_length: Optional[int] = None,
         model_override_args: Optional[dict] = None,
+        is_embedding: Optional[bool] = None
     ) -> None:
-
-        self.
-        self.revision = revision
-        self.model_override_args = model_override_args
+        # Parse args
+        self.model_override_args = json.loads(model_override_args)
         self.hf_config = get_config(
-
-            trust_remote_code,
-            revision,
-            model_override_args=model_override_args,
+            path,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            model_override_args=self.model_override_args,
         )
         self.hf_text_config = get_hf_text_config(self.hf_config)
+
+        # Check model type
+        self.is_generation = is_generation_model(self.hf_config.architectures, is_embedding)
+        self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
+        self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
+
+        # Derive context length
+        derived_context_len = get_context_length(self.hf_text_config)
+        allow_long_context = os.environ.get(
+            "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
+        )
+
         if context_length is not None:
-
+            if context_length > derived_context_len:
+                if allow_long_context:
+                    logger.warning(
+                        f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+                        f"This may lead to incorrect model outputs or CUDA errors."
+                    )
+                    self.context_len = context_length
+                else:
+                    raise ValueError(
+                        f"User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+                        f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config. "
+                        f"To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
+                    )
+            else:
+                self.context_len = context_length
         else:
-            self.context_len =
+            self.context_len = derived_context_len

         # Unify the config keys for hf_text_config
         self.head_dim = getattr(
@@ -58,7 +88,7 @@ class ModelConfig:
             self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads,
         )

-        # FIXME: temporary special judge for
+        # FIXME: temporary special judge for MLA architecture
         if "DeepseekV2ForCausalLM" in self.hf_config.architectures:
             self.head_dim = 256
             self.attention_arch = AttentionArch.MLA
@@ -89,8 +119,6 @@ class ModelConfig:
         self.num_hidden_layers = self.hf_text_config.num_hidden_layers
         self.vocab_size = self.hf_text_config.vocab_size

-        self.is_encoder_decoder = self.hf_config.model_type in ["mllama"]
-
     # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
     def get_total_num_kv_heads(self) -> int:
         """Returns the total number of KV heads."""
@@ -140,7 +168,6 @@ class ModelConfig:
         # equal to the number of attention heads.
         return self.hf_text_config.num_attention_heads

-    # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L328
     def get_num_kv_heads(self, tensor_parallel_size) -> int:
         """Returns the number of KV heads per GPU."""
         total_num_kv_heads = self.get_total_num_kv_heads()
@@ -169,3 +196,37 @@ def get_hf_text_config(config: PretrainedConfig):
         return config.text_config
     else:
         return config
+
+
+def is_generation_model(model_architectures: List[str], is_embedding: bool = False):
+    # We have two ways to determine whether a model is a generative model.
+    # 1. Check the model architectue
+    # 2. check the `is_embedding` server args
+
+    if (
+        "LlamaEmbeddingModel" in model_architectures
+        or "MistralModel" in model_architectures
+        or "LlamaForSequenceClassification" in model_architectures
+        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
+    ):
+        return False
+    else:
+        return not is_embedding
+
+
+def is_multimodal_model(model_architectures: List[str]):
+    if (
+        "LlavaLlamaForCausalLM" in model_architectures
+        or "LlavaQwenForCausalLM" in model_architectures
+        or "LlavaMistralForCausalLM" in model_architectures
+        or "LlavaVidForCausalLM" in model_architectures
+        or "MllamaForConditionalGeneration" in model_architectures
+        or "Qwen2VLForConditionalGeneration" in model_architectures
+    ):
+        return True
+    else:
+        return False
+
+
+def is_encoder_decoder_model(model_architectures: List[str]):
+    return "MllamaForConditionalGeneration" in model_architectures
sglang/srt/constrained/__init__.py
CHANGED
@@ -51,6 +51,21 @@ except ImportError:
         return build_regex_from_schema(schema, whitespace_pattern)


+try:
+    from xgrammar import (
+        GrammarMatcher,
+        GrammarMatcherInitContext,
+        GrammarMatcherInitContextCache,
+    )
+except ImportError as e:
+
+    class Dummy:
+        pass
+
+    GrammarMatcher = Dummy
+    GrammarMatcherInitContext = Dummy
+    GrammarMatcherInitContextCache = Dummy
+
 __all__ = [
     "RegexGuide",
     "FSMInfo",
@@ -60,4 +75,7 @@ __all__ = [
     "disk_cache",
     "disable_cache",
     "make_byte_level_fsm",
+    "GrammarMatcher",
+    "GrammarMatcherInitContext",
+    "GrammarMatcherInitContextCache",
 ]
sglang/srt/constrained/bnf_cache.py
ADDED
@@ -0,0 +1,61 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Cache for the compressed finite state machine."""
+
+from typing import Tuple
+
+from transformers import AutoTokenizer
+
+from sglang.srt.constrained import (
+    GrammarMatcher,
+    GrammarMatcherInitContext,
+    GrammarMatcherInitContextCache,
+)
+
+MAX_ROLLBACK_TOKENS = 10
+
+
+class BNFCache:
+    grammar_cache: GrammarMatcherInitContextCache
+
+    def __init__(
+        self,
+        tokenizer_path,
+        tokenizer_args_dict,
+        skip_tokenizer_init=False,
+        whitespace_patterns=None,
+    ):
+        # TODO(dark): how to deal with whitespace_patterns and skip_tokenizer_init
+        if skip_tokenizer_init:
+            return
+
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, **tokenizer_args_dict)
+        self.grammar_cache = GrammarMatcherInitContextCache(
+            tokenizer_or_vocab=tokenizer
+        )
+
+    def get_context(self, key: Tuple[str, str]) -> GrammarMatcherInitContext:
+        key_type, key_string = key
+        if key_type == "json":
+            return self.grammar_cache.get_init_context_for_json_schema(key_string)
+        elif key_type == "regex":
+            raise ValueError(f"regex hasn't been supported by xgrammar yet")
+        else:
+            raise ValueError(f"Invalid key_type: {key_type}")
+
+    def query(self, key: Tuple[str, str], vocab_size: int) -> GrammarMatcher:
+        ctx = self.get_context(key)
+        return GrammarMatcher(
+            ctx, max_rollback_tokens=MAX_ROLLBACK_TOKENS, mask_vocab_size=vocab_size
+        )
sglang/srt/constrained/fsm_cache.py
CHANGED
@@ -73,9 +73,16 @@ class FSMCache(BaseToolCache):
     def init_value(self, key):
         key_type, key_string = key
         if key_type == "json":
-
-
-
+            try:
+                regex = build_regex_from_schema(
+                    key_string,
+                    whitespace_pattern=self.constrained_json_whitespace_pattern,
+                )
+            except NotImplementedError as e:
+                logger.warning(
+                    f"skip invalid json schema: json_schema={key_string}, {e=}"
+                )
+                return None, key_string
         elif key_type == "regex":
             regex = key_string
         else: