sglang 0.1.18__py3-none-any.whl → 0.1.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +1 -1
- sglang/api.py +26 -0
- sglang/backend/runtime_endpoint.py +18 -14
- sglang/bench_latency.py +40 -18
- sglang/global_config.py +21 -16
- sglang/lang/chat_template.py +41 -6
- sglang/lang/interpreter.py +5 -1
- sglang/lang/ir.py +61 -25
- sglang/srt/constrained/__init__.py +3 -2
- sglang/srt/hf_transformers_utils.py +7 -3
- sglang/srt/layers/extend_attention.py +2 -1
- sglang/srt/layers/fused_moe.py +181 -167
- sglang/srt/layers/logits_processor.py +55 -19
- sglang/srt/layers/radix_attention.py +33 -59
- sglang/srt/layers/token_attention.py +4 -8
- sglang/srt/managers/controller/cuda_graph_runner.py +172 -0
- sglang/srt/managers/controller/infer_batch.py +244 -36
- sglang/srt/managers/controller/manager_single.py +1 -1
- sglang/srt/managers/controller/model_runner.py +69 -284
- sglang/srt/managers/controller/tp_worker.py +39 -20
- sglang/srt/managers/detokenizer_manager.py +4 -2
- sglang/srt/managers/io_struct.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +14 -13
- sglang/srt/memory_pool.py +33 -6
- sglang/srt/model_config.py +6 -0
- sglang/srt/models/gemma2.py +436 -0
- sglang/srt/models/llama2.py +3 -3
- sglang/srt/models/llama_classification.py +10 -7
- sglang/srt/models/minicpm.py +373 -0
- sglang/srt/models/qwen2_moe.py +454 -0
- sglang/srt/openai_api_adapter.py +2 -2
- sglang/srt/openai_protocol.py +1 -1
- sglang/srt/server.py +18 -8
- sglang/srt/server_args.py +24 -20
- sglang/srt/utils.py +68 -35
- {sglang-0.1.18.dist-info → sglang-0.1.20.dist-info}/METADATA +19 -13
- {sglang-0.1.18.dist-info → sglang-0.1.20.dist-info}/RECORD +40 -36
- {sglang-0.1.18.dist-info → sglang-0.1.20.dist-info}/WHEEL +1 -1
- {sglang-0.1.18.dist-info → sglang-0.1.20.dist-info}/LICENSE +0 -0
- {sglang-0.1.18.dist-info → sglang-0.1.20.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
sglang/api.py
CHANGED
@@ -67,10 +67,16 @@ def gen(
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
     dtype: Optional[type] = None,
     choices: Optional[List[str]] = None,
     regex: Optional[str] = None,
 ):
+    """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+
     if choices:
         return SglSelect(name, choices, 0.0 if temperature is None else temperature)

@@ -91,6 +97,10 @@ def gen(
         frequency_penalty,
         presence_penalty,
         ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
         dtype,
         regex,
     )
@@ -106,6 +116,10 @@ def gen_int(
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
 ):
     return SglGen(
         name,
@@ -117,6 +131,10 @@ def gen_int(
         frequency_penalty,
         presence_penalty,
         ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
         int,
         None,
     )
@@ -132,6 +150,10 @@ def gen_string(
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
 ):
     return SglGen(
         name,
@@ -143,6 +165,10 @@ def gen_string(
         frequency_penalty,
         presence_penalty,
         ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
         str,
         None,
     )
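The new gen() arguments expose the runtime's logprob options to the frontend language. Below is a minimal sketch of how they might be used; the argument names come from this diff, while the call pattern and the exact contents of the returned meta info are assumptions and may differ by version.

import sglang as sgl

@sgl.function
def answer(s, question):
    s += sgl.user(question)
    s += sgl.assistant(
        sgl.gen(
            "reply",
            max_tokens=32,
            return_logprob=True,           # ask the runtime to return token logprobs
            logprob_start_len=0,           # start collecting logprobs from the first token
            top_logprobs_num=5,            # also return top-5 alternatives per position
            return_text_in_logprobs=True,  # include decoded text alongside token ids
        )
    )

state = answer.run(question="What is the capital of France?")
print(state["reply"])                # generated text
print(state.get_meta_info("reply"))  # logprob fields are expected here (assumed shape)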
sglang/backend/runtime_endpoint.py
CHANGED
@@ -1,18 +1,18 @@
 import json
-from typing import
+from typing import List, Optional

 import numpy as np
-import requests

 from sglang.backend.base_backend import BaseBackend
 from sglang.global_config import global_config
 from sglang.lang.chat_template import get_chat_template_by_model_path
 from sglang.lang.interpreter import StreamExecutor
-from sglang.lang.ir import
-from sglang.utils import
+from sglang.lang.ir import SglSamplingParams
+from sglang.utils import http_request


 class RuntimeEndpoint(BaseBackend):
+
     def __init__(
         self,
         base_url: str,
@@ -38,8 +38,7 @@ class RuntimeEndpoint(BaseBackend):
         self.model_info = res.json()

         self.chat_template = get_chat_template_by_model_path(
-            self.model_info["model_path"]
-        )
+            self.model_info["model_path"])

     def get_model_name(self):
         return self.model_info["model_path"]
@@ -125,6 +124,11 @@ class RuntimeEndpoint(BaseBackend):
         else:
             raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")

+        for item in ["return_logprob", "logprob_start_len", "top_logprobs_num", "return_text_in_logprobs"]:
+            value = getattr(sampling_params, item, None)
+            if value is not None:
+                data[item] = value
+
         self._add_images(s, data)

         res = http_request(
@@ -167,6 +171,11 @@ class RuntimeEndpoint(BaseBackend):
         else:
             raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")

+        for item in ["return_logprob", "logprob_start_len", "top_logprobs_num", "return_text_in_logprobs"]:
+            value = getattr(sampling_params, item, None)
+            if value is not None:
+                data[item] = value
+
         data["stream"] = True
         self._add_images(s, data)

@@ -181,21 +190,16 @@ class RuntimeEndpoint(BaseBackend):
         self._assert_success(res)
         pos = 0

-        incomplete_text = ""
         for chunk in res.iter_lines(decode_unicode=False):
             chunk = chunk.decode("utf-8")
             if chunk and chunk.startswith("data:"):
                 if chunk == "data: [DONE]":
                     break
                 data = json.loads(chunk[5:].strip("\n"))
-
+                chunk_text = data["text"][pos:]
                 meta_info = data["meta_info"]
-                pos += len(
-
-                yield text, meta_info
-
-        if len(incomplete_text) > 0:
-            yield incomplete_text, meta_info
+                pos += len(chunk_text)
+                yield chunk_text, meta_info

     def select(
         self,
sglang/bench_latency.py
CHANGED
@@ -32,6 +32,7 @@ import logging
 import multiprocessing
 import time

+
 import numpy as np
 import torch
 import torch.distributed as dist
@@ -70,6 +71,7 @@ class BenchArgs:

 def load_model(server_args, tp_rank):
     suppress_other_loggers()
+    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

     model_config = ModelConfig(path=server_args.model_path)
     model_runner = ModelRunner(
@@ -81,7 +83,7 @@ def load_model(server_args, tp_rank):
         nccl_port=28888,
         server_args=server_args,
     )
-
+    rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
     tokenizer = get_tokenizer(
         server_args.tokenizer_path,
         tokenizer_mode=server_args.tokenizer_mode,
@@ -108,7 +110,7 @@ def prepare_inputs(bench_args, tokenizer):
     for i in range(len(prompts)):
         assert len(input_ids[i]) > bench_args.cut_len

-        tmp_input_ids = input_ids[i][:bench_args.cut_len]
+        tmp_input_ids = input_ids[i][: bench_args.cut_len]
         req = Req(rid=i, origin_input_text=prompts[i], origin_input_ids=tmp_input_ids)
         req.prefix_indices = []
         req.sampling_params = sampling_params
@@ -121,9 +123,9 @@ def prepare_inputs(bench_args, tokenizer):
 def prepare_extend_inputs(bench_args, input_ids, reqs, model_runner):
     for i in range(len(reqs)):
         req = reqs[i]
-        req.input_ids += input_ids[i][bench_args.cut_len:]
+        req.input_ids += input_ids[i][bench_args.cut_len :]
         req.prefix_indices = model_runner.req_to_token_pool.req_to_token[
-            i, :bench_args.cut_len
+            i, : bench_args.cut_len
         ]
     return reqs

@@ -151,7 +153,8 @@ def extend(reqs, model_runner):
         reqs=reqs,
         req_to_token_pool=model_runner.req_to_token_pool,
         token_to_kv_pool=model_runner.token_to_kv_pool,
-        tree_cache=None
+        tree_cache=None,
+    )
     batch.prepare_for_extend(model_runner.model_config.vocab_size, None)
     output = model_runner.forward(batch, ForwardMode.EXTEND)
     next_token_ids, _ = batch.sample(output.next_token_logits)
@@ -165,6 +168,7 @@ def decode(input_token_ids, batch, model_runner):
     return next_token_ids, output.next_token_logits


+@torch.inference_mode()
 def correctness_test(
     server_args,
     bench_args,
@@ -178,9 +182,10 @@ def correctness_test(
     # Prepare inputs
     input_ids, reqs = prepare_inputs(bench_args, tokenizer)

-
-
-
+    if bench_args.cut_len > 0:
+        # Prefill
+        next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
+        rank_print("prefill logits (first half)", next_token_logits)

     # Prepare extend inputs
     reqs = prepare_extend_inputs(bench_args, input_ids, reqs, model_runner)
@@ -190,7 +195,7 @@ def correctness_test(
     rank_print("prefill logits (final)", next_token_logits)

     # Decode
-    output_ids = [
+    output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
     for _ in range(bench_args.output_len):
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
         for i in range(len(reqs)):
@@ -198,7 +203,7 @@ def correctness_test(

     # Print
     for i in range(len(reqs)):
-
+        rank_print(tokenizer.decode(output_ids[i]))


 def latency_test(
@@ -210,7 +215,9 @@ def latency_test(

     # Load the model
     model_runner, tokenizer = load_model(server_args, tp_rank)
-
+    rank_print(
+        f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}"
+    )

     # Prepare inputs
     reqs = prepare_synthetic_inputs(bench_args, tokenizer)
@@ -230,7 +237,9 @@ def latency_test(
         prefill_latency = time.time() - tic
         tot_latency += prefill_latency
         throughput = bench_args.input_len * bench_args.batch_size / prefill_latency
-        rank_print(
+        rank_print(
+            f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+        )

         # Decode
         for i in range(output_len):
@@ -241,13 +250,24 @@ def latency_test(
             latency = time.time() - tic
             tot_latency += latency
             throughput = bench_args.batch_size / latency
-            if i < 5:
+            if i < 5:
+                rank_print(
+                    f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+                )
         avg_decode_latency = (tot_latency - prefill_latency) / output_len
         avg_decode_throughput = bench_args.batch_size / avg_decode_latency
-        rank_print(
-
-
-
+        rank_print(
+            f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
+        )
+
+        throughput = (
+            (bench_args.input_len + bench_args.output_len)
+            * bench_args.batch_size
+            / tot_latency
+        )
+        rank_print(
+            f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
+        )

     # Warm up
     run_once(4)
@@ -281,6 +301,8 @@ def main(server_args, bench_args):
     for proc in workers:
         proc.join()

+    proc.terminate()
+

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -296,4 +318,4 @@ if __name__ == "__main__":
         format="%(message)s",
     )

-    main(server_args, bench_args)
+    main(server_args, bench_args)
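The throughput figures that bench_latency.py now prints are plain token-count arithmetic over wall-clock time. A small illustration with made-up values (not measured output; the variable names mirror the benchmark's arguments):

input_len, output_len, batch_size = 512, 128, 8
prefill_latency, tot_latency = 0.42, 3.10  # seconds, hypothetical measurements

prefill_throughput = input_len * batch_size / prefill_latency            # ~9752 token/s
total_throughput = (input_len + output_len) * batch_size / tot_latency   # ~1652 token/s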
sglang/global_config.py
CHANGED
@@ -8,35 +8,40 @@ class GlobalConfig:
         # 2: output final text after every run
         self.verbosity = 0

+        # Default backend of the language
         self.default_backend = None

-        #
+        # Runtime constants: Request dependency time due to network delay
+        self.request_dependency_delay = 0.02
+        self.wait_for_new_request_delay = 0.0006
+
+        # Runtime constants: New generation token ratio estimation
+        self.base_new_token_ratio = 0.4
+        self.base_min_new_token_ratio = 0.2
+        self.new_token_ratio_decay = 0.0001
+        self.new_token_ratio_recovery = 0.05
+
+        # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
+        # This can improve the speed for large batch sizes during prefill.
+        self.layer_sync_threshold = 8192
+
+        # Runtime constants: Flashinfer
+        self.flashinfer_workspace_size = 192 * 1024 * 1024
+
+        # Output tokenization configs
         self.skip_special_tokens_in_output = True
         self.spaces_between_special_tokens_in_out = True

-        #
+        # Interpreter optimization configs
         self.eager_fill_image = False
         self.enable_precache_with_tracing = True
         self.enable_parallel_encoding = True
         self.enable_parallel_decoding = True

+        # Deprecated
         # Choices: ["no_adjust", "adjust_cache"]
         # no_adjust: Do not adjust the position embedding of KV cache.
         # adjust_cache: Adjust the position embedding of KV cache.
         self.concate_and_append_mode = "no_adjust"

-        # Request dependency time due to network delay
-        self.request_dependency_delay = 0.02
-        self.wait_for_new_request_delay = 0.0006
-
-        # New generation token ratio estimation
-        self.base_new_token_ratio = 0.4
-        self.base_min_new_token_ratio = 0.2
-        self.new_token_ratio_decay = 0.0001
-        self.new_token_ratio_recovery = 0.05
-
-        # The threshold (number of tokens) to trigger layer-wise cuda sync.
-        # This can improve the speed for large batch sizes during prefill.
-        self.layer_sync_threshold = 8192
-
 global_config = GlobalConfig()
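Because the module exposes a single global_config instance, these runtime constants can in principle be adjusted before the server or model runner is constructed. A hedged sketch (overriding defaults this way is an assumption based on the instance shown above, not a documented configuration interface):

from sglang.global_config import global_config

# Hypothetical tuning: larger FlashInfer workspace, lower layer-sync threshold.
global_config.flashinfer_workspace_size = 256 * 1024 * 1024
global_config.layer_sync_threshold = 4096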
sglang/lang/chat_template.py
CHANGED
@@ -84,7 +84,7 @@ register_chat_template(
         "system": ("SYSTEM:", "\n"),
         "user": ("USER:", "\n"),
         "assistant": ("ASSISTANT:", "\n"),
-        }
+        }
     )
 )

@@ -116,6 +116,23 @@ register_chat_template(
     )
 )

+# There is default system prompt for qwen
+# reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
+# The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+register_chat_template(
+    ChatTemplate(
+        name="qwen",
+        default_system_prompt="You are a helpful assistant.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>",),
+    )
+)
+

 register_chat_template(
     ChatTemplate(
@@ -132,6 +149,7 @@ register_chat_template(
     )
 )

+# Reference: https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template
 register_chat_template(
     ChatTemplate(
         name="vicuna_v1.1",
@@ -148,6 +166,20 @@ register_chat_template(
     )
 )

+# Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
+register_chat_template(
+    ChatTemplate(
+        name="yi-1.5",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("", ""),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n<|im_start|>assistant\n"),
+            "assistant": ("", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>",)
+    )
+)

 register_chat_template(
     ChatTemplate(
@@ -187,7 +219,7 @@ register_chat_template(
 # Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
 register_chat_template(
     ChatTemplate(
-        name="yi",
+        name="yi-vl",
         default_system_prompt=(
             "This is a chat between an inquisitive human and an AI assistant. Assume the role of the AI assistant. Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers."
             "这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。"
@@ -289,8 +321,9 @@ def match_chat_ml(model_path: str):
     model_path = model_path.lower()
     if "tinyllama" in model_path:
         return get_chat_template("chatml")
-
-
+    # Now the suffix for qwen2 chat model is "instruct"
+    if "qwen" in model_path and ("chat" in model_path or "instruct" in model_path):
+        return get_chat_template("qwen")
     if (
         "llava-v1.6-34b" in model_path
         or "llava-v1.6-yi-34b" in model_path
@@ -302,8 +335,10 @@ def match_chat_ml(model_path: str):
 @register_chat_template_matching_function
 def match_chat_yi(model_path: str):
     model_path = model_path.lower()
-    if "yi" in model_path and "llava" not in model_path:
-        return get_chat_template("yi")
+    if "yi-vl" in model_path and "llava" not in model_path:
+        return get_chat_template("yi-vl")
+    elif "yi-1.5" in model_path and "chat" in model_path:
+        return get_chat_template("yi-1.5")


 @register_chat_template_matching_function
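Given the prefixes and suffixes registered for the new "qwen" template above, a one-turn conversation should render roughly as follows. This is assembled by hand from the role_prefix_and_suffix table in the diff; exact whitespace handling by the interpreter is an assumption.

# system prompt + user turn + assistant prefix, ready for generation:
prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\nWhat is the capital of France?<|im_end|>\n"
    "<|im_start|>assistant\n"
)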
sglang/lang/interpreter.py
CHANGED
@@ -523,9 +523,9 @@ class StreamExecutor:
             self, sampling_params=sampling_params
         )

+        self.variables[name] = ""
         self.stream_var_event[name].set()

-        self.variables[name] = ""
         for comp, meta_info in generator:
             self.text_ += comp
             self.variables[name] += comp
@@ -668,6 +668,10 @@ class StreamExecutor:
             "frequency_penalty",
             "presence_penalty",
             "ignore_eos",
+            "return_logprob",
+            "logprob_start_len",
+            "top_logprobs_num",
+            "return_text_in_logprobs",
             "dtype",
             "regex",
         ]: