PyPI - sglang - Versions diffs - 0.4.4.post1__tar.gz → 0.4.4.post3__tar.gz - Mend

sglang 0.4.4.post1__tar.gz → 0.4.4.post3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (586) hide show

{sglang-0.4.4.post1/sglang.egg-info → sglang-0.4.4.post3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.4.post1
+Version: 0.4.4.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License:                                  Apache License
                                    Version 2.0, January 2004
@@ -218,6 +218,7 @@ Requires-Dist: numpy
 Requires-Dist: IPython
 Requires-Dist: setproctitle
 Provides-Extra: runtime-common
+Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: datasets; extra == "runtime-common"
 Requires-Dist: decord; extra == "runtime-common"
 Requires-Dist: fastapi; extra == "runtime-common"
@@ -235,19 +236,22 @@ Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
+Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
-Requires-Dist: transformers==4.48.3; extra == "runtime-common"
+Requires-Dist: transformers==4.50.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.15; extra == "runtime-common"
+Requires-Dist: compressed-tensors; extra == "runtime-common"
+Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.5; extra == "srt"
+Requires-Dist: sgl-kernel==0.0.5.post4; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
-Requires-Dist: vllm<=0.7.2,>=0.6.4.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
+Requires-Dist: partial_json_parser; extra == "srt"
+Requires-Dist: einops; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -271,7 +275,7 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: torch-memory-saver
-Requires-Dist: torch_memory_saver; extra == "torch-memory-saver"
+Requires-Dist: torch_memory_saver>=0.0.4; extra == "torch-memory-saver"
 Provides-Extra: test
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
@@ -319,6 +323,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"
 Provides-Extra: dev-cpu
 Requires-Dist: sglang[all_cpu]; extra == "dev-cpu"
 Requires-Dist: sglang[test]; extra == "dev-cpu"
+Dynamic: license-file
 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -342,6 +347,9 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 ## News
+- [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
+- [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
+- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
 - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -362,7 +370,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

{sglang-0.4.4.post1 → sglang-0.4.4.post3}/README.md RENAMED Viewed

@@ -20,6 +20,9 @@
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 ## News
+- [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
+- [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
+- [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
 - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -40,7 +43,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

{sglang-0.4.4.post1 → sglang-0.4.4.post3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sglang"
-version = "0.4.4.post1"
+version = "0.4.4.post3"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -17,6 +17,7 @@ dependencies = ["aiohttp", "requests", "tqdm", "numpy", "IPython", "setproctitle
 [project.optional-dependencies]
 runtime_common = [
+    "compressed-tensors",
     "datasets",
     "decord",
     "fastapi",
@@ -34,26 +35,34 @@ runtime_common = [
     "pydantic",
     "python-multipart",
     "pyzmq>=25.1.2",
+    "soundfile==0.13.1",
     "torchao>=0.7.0",
-    "transformers==4.48.3",
+    "transformers==4.50.0",
     "uvicorn",
     "uvloop",
-    "xgrammar==0.1.15",
+    "compressed-tensors",
+    "xgrammar==0.1.17",
 ]
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.0.5",
+    "sgl-kernel==0.0.5.post4",
     "flashinfer_python==0.2.3",
     "torch==2.5.1",
-    "vllm>=0.6.4.post1,<=0.7.2",
     "cuda-python",
     "outlines>=0.0.44,<=0.1.11",
+    "partial_json_parser",
+    "einops",
 ]
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20250114, not from public vllm whl
-srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.7.dev2", "outlines==0.1.11"]
+srt_hip = [
+    "sglang[runtime_common]",
+    "torch",
+    "vllm==0.6.7.dev2",
+    "outlines==0.1.11"
+]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
@@ -71,7 +80,7 @@ srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "torch"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
-torch_memory_saver = ["torch_memory_saver"]
+torch_memory_saver = ["torch_memory_saver>=0.0.4"]
 test = [
     "jsonlines",
     "matplotlib",

{sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/__init__.py RENAMED Viewed

@@ -32,6 +32,7 @@ from sglang.lang.choices import (
 )
 from sglang.utils import LazyImport
+ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
 Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
 LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
 OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
@@ -67,6 +68,7 @@ __all__ = [
     "greedy_token_selection",
     "token_length_normalized",
     "unconditional_likelihood_normalized",
+    "ServerArgs",
     "Anthropic",
     "LiteLLM",
     "OpenAI",

{sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/api.py RENAMED Viewed

@@ -75,6 +75,7 @@ def gen(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
     min_tokens: Optional[int] = None,
+    n: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
     stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
@@ -115,6 +116,7 @@ def gen(
         name,
         max_tokens,
         min_tokens,
+        n,
         stop,
         stop_token_ids,
         temperature,
@@ -137,6 +139,7 @@ def gen(
 def gen_int(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
+    n: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
     stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
@@ -155,6 +158,7 @@ def gen_int(
         name,
         max_tokens,
         None,
+        n,
         stop,
         stop_token_ids,
         temperature,
@@ -176,6 +180,7 @@ def gen_int(
 def gen_string(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
+    n: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
     stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
@@ -194,6 +199,7 @@ def gen_string(
         name,
         max_tokens,
         None,
+        n,
         stop,
         stop_token_ids,
         temperature,

{sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/bench_one_batch.py RENAMED Viewed

@@ -117,7 +117,7 @@ class BenchArgs:
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
-        # use the default value's type to case the args into correct types.
+        # use the default value's type to cast the args into correct types.
         attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
         return cls(
             **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}

{sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/bench_one_batch_server.py RENAMED Viewed

@@ -57,7 +57,7 @@ class BenchArgs:
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
-        # use the default value's type to case the args into correct types.
+        # use the default value's type to cast the args into correct types.
         attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
         return cls(
             **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}

{sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/bench_serving.py RENAMED Viewed

@@ -128,7 +128,7 @@ async def async_request_trt_llm(
                         timestamp = time.perf_counter()
                         # First token
                         if ttft == 0.0:
-                            ttft = time.perf_counter() - st
+                            ttft = timestamp - st
                             output.ttft = ttft
                         # Decoding phase
@@ -501,6 +501,7 @@ def get_dataset(args, tokenizer):
             question_len=args.gsp_question_len,
             output_len=args.gsp_output_len,
             tokenizer=tokenizer,
+            args=args,
         )
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
@@ -788,6 +789,7 @@ def sample_generated_shared_prefix_requests(
     question_len: int,
     output_len: int,
     tokenizer: PreTrainedTokenizerBase,
+    args: argparse.Namespace,
 ) -> List[Tuple[str, int, int]]:
     """Generate benchmark requests with shared system prompts using random tokens and caching."""
     cache_path = get_gen_prefix_cache_path(args, tokenizer)
@@ -963,7 +965,7 @@ async def benchmark(
     request_rate: float,
     max_concurrency: Optional[int],
     disable_tqdm: bool,
-    lora_name: str,
+    lora_names: List[str],
     extra_request_body: Dict[str, Any],
     profile: bool,
     pd_seperated: bool = False,
@@ -986,6 +988,11 @@ async def benchmark(
     # Warmup
     print("Starting initial single prompt test run...")
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
+    if lora_names != None and len(lora_names) != 0:
+        lora_name = lora_names[0]
+    else:
+        lora_name = None
     test_input = RequestFuncInput(
         model=model_id,
         prompt=test_prompt,
@@ -1026,6 +1033,12 @@ async def benchmark(
     tasks: List[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
+        if lora_names != None and len(lora_names) != 0:
+            idx = random.randint(0, len(lora_names) - 1)
+            lora_name = lora_names[idx]
+        else:
+            lora_name = None
         request_func_input = RequestFuncInput(
             model=model_id,
             prompt=prompt,
@@ -1345,7 +1358,7 @@ def run_benchmark(args_: argparse.Namespace):
             request_rate=args.request_rate,
             max_concurrency=args.max_concurrency,
             disable_tqdm=args.disable_tqdm,
-            lora_name=args.lora_name,
+            lora_names=args.lora_name,
             extra_request_body=extra_request_body,
             profile=args.profile,
             pd_seperated=args.pd_seperated,
@@ -1364,6 +1377,13 @@ def set_ulimit(target_soft_limit=65535):
             print(f"Fail to set RLIMIT_NOFILE: {e}")
+class LoRAPathAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, [])
+        for lora_name in values:
+            getattr(namespace, self.dest).append(lora_name)
 if __name__ == "__main__":
     parser = ArgumentParser(description="Benchmark the online serving throughput.")
     parser.add_argument(
@@ -1507,8 +1527,10 @@ if __name__ == "__main__":
     parser.add_argument(
         "--lora-name",
         type=str,
+        nargs="*",
         default=None,
-        help="The name of LoRA adapter",
+        action=LoRAPathAction,
+        help="The names of LoRA adapters. You can provide a list of names in the format {name} {name} {name}...",
     )
     parser.add_argument(
         "--prompt-suffix",

{sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/check_env.py RENAMED Viewed

@@ -1,6 +1,6 @@
 """Check environment configurations and dependency versions."""
-import importlib
+import importlib.metadata
 import os
 import resource
 import subprocess
@@ -59,9 +59,8 @@ def get_package_versions(packages):
     for package in packages:
         package_name = package.split("==")[0].split(">=")[0].split("<=")[0]
         try:
-            module = importlib.import_module(package_name)
-            if hasattr(module, "__version__"):
-                versions[package_name] = module.__version__
+            version = importlib.metadata.version(package_name)
+            versions[package_name] = version
         except ModuleNotFoundError:
             versions[package_name] = "Module Not Found"
     return versions

{sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/backend/openai.py RENAMED Viewed

@@ -165,6 +165,7 @@ class OpenAI(BaseBackend):
                 kwargs.pop("max_tokens", None)
             else:
                 kwargs.pop("max_completion_tokens", None)
             comp = openai_completion(
                 client=self.client,
                 token_usage=self.token_usage,
@@ -173,13 +174,13 @@ class OpenAI(BaseBackend):
                 prompt=prompt,
                 **kwargs,
             )
+            # Keep the returned list (or string) as is.
         elif sampling_params.dtype in [str, "str", "string"]:
             assert (
                 not self.is_chat_model
             ), "constrained type not supported on chat model"
             kwargs = sampling_params.to_openai_kwargs()
             kwargs.pop("stop")
             comp = openai_completion(
                 client=self.client,
                 token_usage=self.token_usage,
@@ -189,7 +190,11 @@ class OpenAI(BaseBackend):
                 stop='"',
                 **kwargs,
             )
-            comp = '"' + comp + '"'
+            # Wrap each element in quotes if we have a list.
+            if isinstance(comp, list):
+                comp = ['"' + x + '"' for x in comp]
+            else:
+                comp = '"' + comp + '"'
         elif sampling_params.dtype in [int, "int"]:
             assert (
                 not self.is_chat_model
@@ -206,6 +211,7 @@ class OpenAI(BaseBackend):
                 stop=[" "],
                 **kwargs,
             )
+            # Leave as a list if that's what is returned.
         else:
             raise ValueError(f"Unknown dtype: {sampling_params.dtype}")
@@ -254,7 +260,9 @@ class OpenAI(BaseBackend):
                     prompt=s.messages_,
                     **self.spec_kwargs,
                 )
-                if self.spec_pattern_match(comp):
+                # Use a string for pattern matching.
+                comp_for_match = comp[0] if isinstance(comp, list) else comp
+                if self.spec_pattern_match(comp_for_match):
                     break
         for term in self.spec_format:
@@ -370,7 +378,7 @@ class OpenAI(BaseBackend):
 def openai_completion(
     client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
-):
+) -> Union[str, List[str]]:
     # if "ebnf" is in kwargs, warn and remove
     if "ebnf" in kwargs:
         warnings.warn("EBNF is not officially supported by OpenAI endpoints. Ignoring.")
@@ -382,13 +390,18 @@ def openai_completion(
                 if "stop" in kwargs and kwargs["stop"] is None:
                     kwargs.pop("stop")
                 ret = client.chat.completions.create(messages=prompt, **kwargs)
-                comp = ret.choices[0].message.content
+                if len(ret.choices) == 1:
+                    comp = ret.choices[0].message.content
+                else:
+                    comp = [c.message.content for c in ret.choices]
             else:
                 ret = client.completions.create(prompt=prompt, **kwargs)
                 if isinstance(prompt, (list, tuple)):
                     comp = [c.text for c in ret.choices]
                 else:
                     comp = ret.choices[0].text
+                    if len(ret.choices) > 1:
+                        comp = [c.text for c in ret.choices]
             token_usage.prompt_tokens += ret.usage.prompt_tokens
             token_usage.completion_tokens += ret.usage.completion_tokens

{sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/chat_template.py RENAMED Viewed

@@ -15,6 +15,7 @@ class ChatTemplate:
     role_prefix_and_suffix: Dict[str, Tuple[str, str]]
     stop_str: List[str] = ()
     image_token: str = "<image>"
+    audio_token: str = "<audio>"
     style: ChatTemplateStyle = ChatTemplateStyle.PLAIN
     def get_prefix_and_suffix(
@@ -253,6 +254,22 @@ register_chat_template(
     )
 )
+# https://huggingface.co/openbmb/MiniCPM-o-2_6
+register_chat_template(
+    ChatTemplate(
+        name="minicpmo",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("", " "),
+            "user": ("user:", " "),
+            "assistant": ("assistant:", "</s>"),
+        },
+        stop_str=("<|im_end|>", "<|endoftext|>"),
+        image_token="(<image>./</image>)",
+        audio_token="(<audio>./</audio>)",
+    )
+)
 # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
@@ -474,12 +491,6 @@ def match_chat_ml(model_path: str):
         return get_chat_template("chatml-llava")
-@register_chat_template_matching_function
-def match_chat_minicpm(model_path: str):
-    if "minicpm" in model_path:
-        return get_chat_template("minicpmv")
 @register_chat_template_matching_function
 def match_chat_yi(model_path: str):
     model_path = model_path.lower()
@@ -499,8 +510,10 @@ def match_gemma_it(model_path: str):
 @register_chat_template_matching_function
 def match_openbmb_minicpm(model_path: str):
     model_path = model_path.lower()
-    if "minicpm" in model_path:
+    if "minicpm-v" in model_path:
         return get_chat_template("minicpmv")
+    elif "minicpm-o" in model_path:
+        return get_chat_template("minicpmo")
 @register_chat_template_matching_function
@@ -520,6 +533,14 @@ def match_granite_instruct(model_path: str):
         return get_chat_template("granite-3-instruct")
+@register_chat_template_matching_function
+def match_gemma3_instruct(model_path: str):
+    model_path = model_path.lower()
+    if "gemma-3" in model_path and "1b" not in model_path:
+        # gemma-3-1b-it is completion model
+        return get_chat_template("gemma-it")
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default

{sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/interpreter.py RENAMED Viewed

@@ -566,13 +566,13 @@ class StreamExecutor:
     def _execute_gen(self, expr: SglGen):
         sampling_params = self._resolve_sampling_params(expr.sampling_params)
         name = expr.name
         if not self.stream:
             if self.num_api_spec_tokens is None:
                 comp, meta_info = self.backend.generate(
                     self,
                     sampling_params=sampling_params,
                 )
             else:
                 if self.backend.is_chat_model:
                     # Speculative execution on models with only chat interface.
@@ -587,8 +587,11 @@ class StreamExecutor:
                 else:  # Speculative execution on models with completion interface
                     comp, meta_info = self._spec_gen(sampling_params)
-            self.text_ += comp
+            if isinstance(comp, list):
+                self.text_ += comp[0]
+            else:
+                assert isinstance(comp, str)
+                self.text_ += comp
             self.variables[name] = comp
             self.meta_info[name] = meta_info
@@ -747,6 +750,7 @@ class StreamExecutor:
         for item in [
             "max_new_tokens",
             "min_new_tokens",
+            "n",
             "stop",
             "stop_token_ids",
             "temperature",

{sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/lang/ir.py RENAMED Viewed

@@ -18,6 +18,7 @@ REGEX_STR = r"\"[\w\d\s]*\""  # bugs with regex r"\".*\"" in interegular pkg
 class SglSamplingParams:
     max_new_tokens: int = 128
     min_new_tokens: int = 0
+    n: int = 1
     stop: Union[str, List[str]] = ()
     stop_token_ids: Optional[List[int]] = ()
     temperature: float = 1.0
@@ -41,6 +42,7 @@ class SglSamplingParams:
         return SglSamplingParams(
             self.max_new_tokens,
             self.min_new_tokens,
+            self.n,
             self.stop,
             self.stop_token_ids,
             self.temperature,
@@ -64,6 +66,7 @@ class SglSamplingParams:
         return {
             "max_tokens": self.max_new_tokens,
             "max_completion_tokens": self.max_new_tokens,
+            "n": self.n,
             "stop": self.stop or None,
             "temperature": self.temperature,
             "top_p": self.top_p,
@@ -117,6 +120,7 @@ class SglSamplingParams:
         return {
             "max_new_tokens": self.max_new_tokens,
             "min_new_tokens": self.min_new_tokens,
+            "n": self.n,
             "stop": self.stop,
             "stop_token_ids": self.stop_token_ids,
             "temperature": self.temperature,
@@ -154,6 +158,7 @@ class SglFunction:
         self,
         *args,
         max_new_tokens: int = 128,
+        n: int = 1,
         stop: Optional[Union[str, List[str]]] = None,
         stop_token_ids: Optional[List[int]] = None,
         temperature: float = 1.0,
@@ -182,6 +187,7 @@ class SglFunction:
         default_sampling_para = SglSamplingParams(
             max_new_tokens=max_new_tokens,
+            n=n,
             stop=stop,
             stop_token_ids=stop_token_ids,
             temperature=temperature,
@@ -212,6 +218,7 @@ class SglFunction:
         batch_kwargs,
         *,
         max_new_tokens: int = 128,
+        n: int = 1,
         stop: Optional[Union[str, List[str]]] = None,
         stop_token_ids: Optional[List[int]] = None,
         temperature: float = 1.0,
@@ -257,6 +264,7 @@ class SglFunction:
         default_sampling_para = SglSamplingParams(
             max_new_tokens=max_new_tokens,
+            n=n,
             stop=stop,
             stop_token_ids=stop_token_ids,
             temperature=temperature,
@@ -440,6 +448,7 @@ class SglGen(SglExpr):
         name: Optional[str] = None,
         max_new_tokens: Optional[int] = None,
         min_new_tokens: Optional[int] = None,
+        n: Optional[int] = None,
         stop: Optional[Union[str, List[str]]] = None,
         stop_token_ids: Optional[List[int]] = None,
         temperature: Optional[float] = None,
@@ -463,6 +472,7 @@ class SglGen(SglExpr):
         self.sampling_params = SglSamplingParams(
             max_new_tokens=max_new_tokens,
             min_new_tokens=min_new_tokens,
+            n=n,
             stop=stop,
             stop_token_ids=stop_token_ids,
             temperature=temperature,

{sglang-0.4.4.post1 → sglang-0.4.4.post3}/sglang/srt/_custom_ops.py RENAMED Viewed

@@ -10,7 +10,7 @@ from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
 logger = logging.getLogger(__name__)
 use_vllm_custom_allreduce = get_bool_env_var(
-    "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
+    "USE_VLLM_CUSTOM_ALLREDUCE", default="false"
 )
 if not is_hpu():

sglang 0.4.4.post1__tar.gz → 0.4.4.post3__tar.gz

sglang 0.4.4.post1__tar.gz → 0.4.4.post3__tar.gz