sglang-0.1.18-py3-none-any.whl → sglang-0.1.19-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. sglang/__init__.py +1 -1
  2. sglang/api.py +26 -0
  3. sglang/backend/runtime_endpoint.py +18 -14
  4. sglang/bench_latency.py +34 -16
  5. sglang/global_config.py +1 -0
  6. sglang/lang/chat_template.py +41 -6
  7. sglang/lang/interpreter.py +5 -1
  8. sglang/lang/ir.py +61 -25
  9. sglang/srt/constrained/__init__.py +3 -2
  10. sglang/srt/hf_transformers_utils.py +7 -3
  11. sglang/srt/layers/extend_attention.py +2 -1
  12. sglang/srt/layers/fused_moe.py +181 -167
  13. sglang/srt/layers/logits_processor.py +55 -19
  14. sglang/srt/layers/radix_attention.py +24 -27
  15. sglang/srt/layers/token_attention.py +4 -1
  16. sglang/srt/managers/controller/infer_batch.py +2 -2
  17. sglang/srt/managers/controller/manager_single.py +1 -1
  18. sglang/srt/managers/controller/model_runner.py +27 -15
  19. sglang/srt/managers/controller/tp_worker.py +31 -14
  20. sglang/srt/managers/detokenizer_manager.py +4 -2
  21. sglang/srt/managers/io_struct.py +1 -1
  22. sglang/srt/managers/tokenizer_manager.py +14 -13
  23. sglang/srt/model_config.py +6 -0
  24. sglang/srt/models/gemma2.py +436 -0
  25. sglang/srt/models/llama2.py +3 -3
  26. sglang/srt/models/llama_classification.py +10 -7
  27. sglang/srt/models/minicpm.py +373 -0
  28. sglang/srt/models/qwen2_moe.py +454 -0
  29. sglang/srt/openai_api_adapter.py +2 -2
  30. sglang/srt/openai_protocol.py +1 -1
  31. sglang/srt/server.py +17 -8
  32. sglang/srt/server_args.py +14 -16
  33. sglang/srt/utils.py +68 -35
  34. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/METADATA +19 -13
  35. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/RECORD +38 -35
  36. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/LICENSE +0 -0
  37. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/WHEEL +0 -0
  38. {sglang-0.1.18.dist-info → sglang-0.1.19.dist-info}/top_level.txt +0 -0
sglang/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.1.18"
+ __version__ = "0.1.19"

  # SGL API Components
  from sglang.api import (
sglang/api.py CHANGED
@@ -67,10 +67,16 @@ def gen(
  frequency_penalty: Optional[float] = None,
  presence_penalty: Optional[float] = None,
  ignore_eos: Optional[bool] = None,
+ return_logprob: Optional[bool] = None,
+ logprob_start_len: Optional[int] = None,
+ top_logprobs_num: Optional[int] = None,
+ return_text_in_logprobs: Optional[bool] = None,
  dtype: Optional[type] = None,
  choices: Optional[List[str]] = None,
  regex: Optional[str] = None,
  ):
+ """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+
  if choices:
  return SglSelect(name, choices, 0.0 if temperature is None else temperature)

@@ -91,6 +97,10 @@ def gen(
  frequency_penalty,
  presence_penalty,
  ignore_eos,
+ return_logprob,
+ logprob_start_len,
+ top_logprobs_num,
+ return_text_in_logprobs,
  dtype,
  regex,
  )
@@ -106,6 +116,10 @@ def gen_int(
  frequency_penalty: Optional[float] = None,
  presence_penalty: Optional[float] = None,
  ignore_eos: Optional[bool] = None,
+ return_logprob: Optional[bool] = None,
+ logprob_start_len: Optional[int] = None,
+ top_logprobs_num: Optional[int] = None,
+ return_text_in_logprobs: Optional[bool] = None,
  ):
  return SglGen(
  name,
@@ -117,6 +131,10 @@ def gen_int(
  frequency_penalty,
  presence_penalty,
  ignore_eos,
+ return_logprob,
+ logprob_start_len,
+ top_logprobs_num,
+ return_text_in_logprobs,
  int,
  None,
  )
@@ -132,6 +150,10 @@ def gen_string(
  frequency_penalty: Optional[float] = None,
  presence_penalty: Optional[float] = None,
  ignore_eos: Optional[bool] = None,
+ return_logprob: Optional[bool] = None,
+ logprob_start_len: Optional[int] = None,
+ top_logprobs_num: Optional[int] = None,
+ return_text_in_logprobs: Optional[bool] = None,
  ):
  return SglGen(
  name,
@@ -143,6 +165,10 @@ def gen_string(
  frequency_penalty,
  presence_penalty,
  ignore_eos,
+ return_logprob,
+ logprob_start_len,
+ top_logprobs_num,
+ return_text_in_logprobs,
  str,
  None,
  )
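Note: the four new arguments added to gen/gen_int/gen_string above flow into SglSamplingParams (see sglang/lang/ir.py below) and, for the runtime backend, into the generate request. A minimal, hedged sketch of using them from a frontend program (the program, question, and endpoint URL are illustrative, not part of this release):

import sglang as sgl

@sgl.function
def qa(s, question):
    s += "Q: " + question + "\nA: "
    s += sgl.gen(
        "answer",
        max_new_tokens=64,
        return_logprob=True,            # ask for per-token logprobs
        logprob_start_len=0,            # report logprobs from the first position
        top_logprobs_num=5,             # also return the top-5 alternatives per position
        return_text_in_logprobs=True,   # include decoded text alongside token ids
    )

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = qa.run(question="What is the capital of France?")
print(state.get_meta_info("answer"))    # logprob fields are returned in the meta info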
sglang/backend/runtime_endpoint.py CHANGED
@@ -1,18 +1,18 @@
  import json
- from typing import Callable, List, Optional, Union
+ from typing import List, Optional

  import numpy as np
- import requests

  from sglang.backend.base_backend import BaseBackend
  from sglang.global_config import global_config
  from sglang.lang.chat_template import get_chat_template_by_model_path
  from sglang.lang.interpreter import StreamExecutor
- from sglang.lang.ir import SglArgument, SglSamplingParams
- from sglang.utils import encode_image_base64, find_printable_text, http_request
+ from sglang.lang.ir import SglSamplingParams
+ from sglang.utils import http_request


  class RuntimeEndpoint(BaseBackend):
+
  def __init__(
  self,
  base_url: str,
@@ -38,8 +38,7 @@ class RuntimeEndpoint(BaseBackend):
  self.model_info = res.json()

  self.chat_template = get_chat_template_by_model_path(
- self.model_info["model_path"]
- )
+ self.model_info["model_path"])

  def get_model_name(self):
  return self.model_info["model_path"]
@@ -125,6 +124,11 @@ class RuntimeEndpoint(BaseBackend):
  else:
  raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")

+ for item in ["return_logprob", "logprob_start_len", "top_logprobs_num", "return_text_in_logprobs"]:
+ value = getattr(sampling_params, item, None)
+ if value is not None:
+ data[item] = value
+
  self._add_images(s, data)

  res = http_request(
@@ -167,6 +171,11 @@ class RuntimeEndpoint(BaseBackend):
  else:
  raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")

+ for item in ["return_logprob", "logprob_start_len", "top_logprobs_num", "return_text_in_logprobs"]:
+ value = getattr(sampling_params, item, None)
+ if value is not None:
+ data[item] = value
+
  data["stream"] = True
  self._add_images(s, data)

@@ -181,21 +190,16 @@ class RuntimeEndpoint(BaseBackend):
  self._assert_success(res)
  pos = 0

- incomplete_text = ""
  for chunk in res.iter_lines(decode_unicode=False):
  chunk = chunk.decode("utf-8")
  if chunk and chunk.startswith("data:"):
  if chunk == "data: [DONE]":
  break
  data = json.loads(chunk[5:].strip("\n"))
- text = find_printable_text(data["text"][pos:])
+ chunk_text = data["text"][pos:]
  meta_info = data["meta_info"]
- pos += len(text)
- incomplete_text = data["text"][pos:]
- yield text, meta_info
-
- if len(incomplete_text) > 0:
- yield incomplete_text, meta_info
+ pos += len(chunk_text)
+ yield chunk_text, meta_info

  def select(
  self,
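Note: the last hunk above removes the find_printable_text buffering, so each streamed chunk is now yielded to the caller exactly as the server sent it. A hedged sketch of consuming such a stream, assuming the qa program and backend from the earlier example and the ProgramState.text_iter helper:

state = qa.run(question="Name three rivers.", stream=True)
for piece in state.text_iter("answer"):
    # pieces arrive as raw chunks; nothing is held back until it is "printable"
    print(piece, end="", flush=True)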
sglang/bench_latency.py CHANGED
@@ -108,7 +108,7 @@ def prepare_inputs(bench_args, tokenizer):
  for i in range(len(prompts)):
  assert len(input_ids[i]) > bench_args.cut_len

- tmp_input_ids = input_ids[i][:bench_args.cut_len]
+ tmp_input_ids = input_ids[i][: bench_args.cut_len]
  req = Req(rid=i, origin_input_text=prompts[i], origin_input_ids=tmp_input_ids)
  req.prefix_indices = []
  req.sampling_params = sampling_params
@@ -121,9 +121,9 @@ def prepare_inputs(bench_args, tokenizer):
  def prepare_extend_inputs(bench_args, input_ids, reqs, model_runner):
  for i in range(len(reqs)):
  req = reqs[i]
- req.input_ids += input_ids[i][bench_args.cut_len:]
+ req.input_ids += input_ids[i][bench_args.cut_len :]
  req.prefix_indices = model_runner.req_to_token_pool.req_to_token[
- i, :bench_args.cut_len
+ i, : bench_args.cut_len
  ]
  return reqs

@@ -151,7 +151,8 @@ def extend(reqs, model_runner):
  reqs=reqs,
  req_to_token_pool=model_runner.req_to_token_pool,
  token_to_kv_pool=model_runner.token_to_kv_pool,
- tree_cache=None)
+ tree_cache=None,
+ )
  batch.prepare_for_extend(model_runner.model_config.vocab_size, None)
  output = model_runner.forward(batch, ForwardMode.EXTEND)
  next_token_ids, _ = batch.sample(output.next_token_logits)
@@ -165,6 +166,7 @@ def decode(input_token_ids, batch, model_runner):
  return next_token_ids, output.next_token_logits


+ @torch.inference_mode()
  def correctness_test(
  server_args,
  bench_args,
@@ -178,9 +180,10 @@ def correctness_test(
  # Prepare inputs
  input_ids, reqs = prepare_inputs(bench_args, tokenizer)

- # Prefill
- next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
- rank_print("prefill logits (first half)", next_token_logits)
+ if bench_args.cut_len > 0:
+ # Prefill
+ next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
+ rank_print("prefill logits (first half)", next_token_logits)

  # Prepare extend inputs
  reqs = prepare_extend_inputs(bench_args, input_ids, reqs, model_runner)
@@ -190,7 +193,7 @@ def correctness_test(
  rank_print("prefill logits (final)", next_token_logits)

  # Decode
- output_ids = [list(req.input_ids) for req in reqs]
+ output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
  for _ in range(bench_args.output_len):
  next_token_ids, _ = decode(next_token_ids, batch, model_runner)
  for i in range(len(reqs)):
@@ -210,7 +213,9 @@ def latency_test(

  # Load the model
  model_runner, tokenizer = load_model(server_args, tp_rank)
- print(f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}")
+ print(
+ f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}"
+ )

  # Prepare inputs
  reqs = prepare_synthetic_inputs(bench_args, tokenizer)
@@ -230,7 +235,9 @@ def latency_test(
  prefill_latency = time.time() - tic
  tot_latency += prefill_latency
  throughput = bench_args.input_len * bench_args.batch_size / prefill_latency
- rank_print(f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s")
+ rank_print(
+ f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+ )

  # Decode
  for i in range(output_len):
@@ -241,13 +248,24 @@ def latency_test(
  latency = time.time() - tic
  tot_latency += latency
  throughput = bench_args.batch_size / latency
- if i < 5: rank_print(f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s")
+ if i < 5:
+ rank_print(
+ f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+ )
  avg_decode_latency = (tot_latency - prefill_latency) / output_len
  avg_decode_throughput = bench_args.batch_size / avg_decode_latency
- rank_print(f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s")
-
- throughput = (bench_args.input_len + bench_args.output_len) * bench_args.batch_size / tot_latency
- rank_print(f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s")
+ rank_print(
+ f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
+ )
+
+ throughput = (
+ (bench_args.input_len + bench_args.output_len)
+ * bench_args.batch_size
+ / tot_latency
+ )
+ rank_print(
+ f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
+ )

  # Warm up
  run_once(4)
@@ -296,4 +314,4 @@ if __name__ == "__main__":
  format="%(message)s",
  )

- main(server_args, bench_args)
+ main(server_args, bench_args)
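Note: as a quick orientation for the figures this script prints, a small worked example of the derived quantities (all numbers illustrative, not measured output):

max_total_num_tokens = 200_000                    # KV cache capacity reported by the model runner
input_len, output_len, batch_size = 512, 128, 16

max_batch_size = max_total_num_tokens // (input_len + output_len)        # 312

prefill_latency = 0.25                            # seconds spent in the extend (prefill) step
prefill_throughput = input_len * batch_size / prefill_latency            # 32768 token/s

tot_latency = 2.25                                # prefill plus all decode steps
total_throughput = (input_len + output_len) * batch_size / tot_latency   # ~4551 token/s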
sglang/global_config.py CHANGED
@@ -39,4 +39,5 @@ class GlobalConfig:
  # This can improve the speed for large batch sizes during prefill.
  self.layer_sync_threshold = 8192

+
  global_config = GlobalConfig()
sglang/lang/chat_template.py CHANGED
@@ -84,7 +84,7 @@ register_chat_template(
  "system": ("SYSTEM:", "\n"),
  "user": ("USER:", "\n"),
  "assistant": ("ASSISTANT:", "\n"),
- },
+ }
  )
  )

@@ -116,6 +116,23 @@ register_chat_template(
  )
  )

+ # There is default system prompt for qwen
+ # reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
+ # The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+ register_chat_template(
+ ChatTemplate(
+ name="qwen",
+ default_system_prompt="You are a helpful assistant.",
+ role_prefix_and_suffix={
+ "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+ "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+ "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+ },
+ style=ChatTemplateStyle.PLAIN,
+ stop_str=("<|im_end|>",),
+ )
+ )
+

  register_chat_template(
  ChatTemplate(
@@ -132,6 +149,7 @@ register_chat_template(
  )
  )

+ # Reference: https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template
  register_chat_template(
  ChatTemplate(
  name="vicuna_v1.1",
@@ -148,6 +166,20 @@ register_chat_template(
  )
  )

+ # Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
+ register_chat_template(
+ ChatTemplate(
+ name="yi-1.5",
+ default_system_prompt=None,
+ role_prefix_and_suffix={
+ "system": ("", ""),
+ "user": ("<|im_start|>user\n", "<|im_end|>\n<|im_start|>assistant\n"),
+ "assistant": ("", "<|im_end|>\n"),
+ },
+ style=ChatTemplateStyle.PLAIN,
+ stop_str=("<|im_end|>",)
+ )
+ )

  register_chat_template(
  ChatTemplate(
@@ -187,7 +219,7 @@ register_chat_template(
  # Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
  register_chat_template(
  ChatTemplate(
- name="yi",
+ name="yi-vl",
  default_system_prompt=(
  "This is a chat between an inquisitive human and an AI assistant. Assume the role of the AI assistant. Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers."
  "这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。"
@@ -289,8 +321,9 @@ def match_chat_ml(model_path: str):
  model_path = model_path.lower()
  if "tinyllama" in model_path:
  return get_chat_template("chatml")
- if "qwen" in model_path and "chat" in model_path:
- return get_chat_template("chatml")
+ # Now the suffix for qwen2 chat model is "instruct"
+ if "qwen" in model_path and ("chat" in model_path or "instruct" in model_path):
+ return get_chat_template("qwen")
  if (
  "llava-v1.6-34b" in model_path
  or "llava-v1.6-yi-34b" in model_path
@@ -302,8 +335,10 @@ def match_chat_ml(model_path: str):
  @register_chat_template_matching_function
  def match_chat_yi(model_path: str):
  model_path = model_path.lower()
- if "yi" in model_path and "llava" not in model_path:
- return get_chat_template("yi")
+ if "yi-vl" in model_path and "llava" not in model_path:
+ return get_chat_template("yi-vl")
+ elif "yi-1.5" in model_path and "chat" in model_path:
+ return get_chat_template("yi-1.5")


  @register_chat_template_matching_function
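Note: the prompt produced by the new "qwen" template can be read directly off the registration above; a short sketch for a single user turn ("Hello" is an arbitrary example message):

from sglang.lang.chat_template import get_chat_template

qwen = get_chat_template("qwen")
sys_prefix, sys_suffix = qwen.role_prefix_and_suffix["system"]
usr_prefix, usr_suffix = qwen.role_prefix_and_suffix["user"]

# Yields:
#   <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n
#   <|im_start|>user\nHello<|im_end|>\n
#   <|im_start|>assistant\n
prompt = (
    sys_prefix + qwen.default_system_prompt + sys_suffix
    + usr_prefix + "Hello" + usr_suffix
    + qwen.role_prefix_and_suffix["assistant"][0]
)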
sglang/lang/interpreter.py CHANGED
@@ -523,9 +523,9 @@ class StreamExecutor:
  self, sampling_params=sampling_params
  )

+ self.variables[name] = ""
  self.stream_var_event[name].set()

- self.variables[name] = ""
  for comp, meta_info in generator:
  self.text_ += comp
  self.variables[name] += comp
@@ -668,6 +668,10 @@ class StreamExecutor:
  "frequency_penalty",
  "presence_penalty",
  "ignore_eos",
+ "return_logprob",
+ "logprob_start_len",
+ "top_logprobs_num",
+ "return_text_in_logprobs",
  "dtype",
  "regex",
  ]:
sglang/lang/ir.py CHANGED
@@ -23,6 +23,10 @@ class SglSamplingParams:
  frequency_penalty: float = 0.0
  presence_penalty: float = 0.0
  ignore_eos: bool = False
+ return_logprob: Optional[bool] = None
+ logprob_start_len: Optional[int] = None,
+ top_logprobs_num: Optional[int] = None,
+ return_text_in_logprobs: Optional[bool] = None,

  # for constrained generation, not included in to_xxx_kwargs
  dtype: Optional[str] = None
@@ -37,6 +41,11 @@ class SglSamplingParams:
  self.top_k,
  self.frequency_penalty,
  self.presence_penalty,
+ self.ignore_eos,
+ self.return_logprob,
+ self.logprob_start_len,
+ self.top_logprobs_num,
+ self.return_text_in_logprobs,
  )

  def to_openai_kwargs(self):
@@ -139,6 +148,10 @@ class SglFunction:
  frequency_penalty: float = 0.0,
  presence_penalty: float = 0.0,
  ignore_eos: bool = False,
+ return_logprob: Optional[bool] = None,
+ logprob_start_len: Optional[int] = None,
+ top_logprobs_num: Optional[int] = None,
+ return_text_in_logprobs: Optional[bool] = None,
  stream: bool = False,
  backend=None,
  **kwargs,
@@ -154,6 +167,10 @@ class SglFunction:
  frequency_penalty=frequency_penalty,
  presence_penalty=presence_penalty,
  ignore_eos=ignore_eos,
+ return_logprob=return_logprob,
+ logprob_start_len=logprob_start_len,
+ top_logprobs_num=top_logprobs_num,
+ return_text_in_logprobs=return_text_in_logprobs,
  )
  backend = backend or global_config.default_backend
  return run_program(self, backend, args, kwargs, default_sampling_para, stream)
@@ -170,6 +187,10 @@ class SglFunction:
  frequency_penalty: float = 0.0,
  presence_penalty: float = 0.0,
  ignore_eos: bool = False,
+ return_logprob: Optional[bool] = None,
+ logprob_start_len: Optional[int] = None,
+ top_logprobs_num: Optional[int] = None,
+ return_text_in_logprobs: Optional[bool] = None,
  backend=None,
  num_threads: Union[str, int] = "auto",
  progress_bar: bool = False,
@@ -185,8 +206,10 @@ class SglFunction:
  batch_kwargs = [
  {self.arg_names[i]: v for i, v in enumerate(arg_values)}
  for arg_values in batch_kwargs
- if isinstance(arg_values, (list, tuple)) and
- len(self.arg_names) - len(self.arg_defaults) <= len(arg_values) <= len(self.arg_names)
+ if isinstance(arg_values, (list, tuple))
+ and len(self.arg_names) - len(self.arg_defaults)
+ <= len(arg_values)
+ <= len(self.arg_names)
  ]
  # Ensure to raise an exception if the number of arguments mismatch
  if len(batch_kwargs) != num_programs:
@@ -201,6 +224,10 @@ class SglFunction:
  frequency_penalty=frequency_penalty,
  presence_penalty=presence_penalty,
  ignore_eos=ignore_eos,
+ return_logprob=return_logprob,
+ logprob_start_len=logprob_start_len,
+ top_logprobs_num=top_logprobs_num,
+ return_text_in_logprobs=return_text_in_logprobs,
  )
  backend = backend or global_config.default_backend
  return run_program_batch(
@@ -348,7 +375,7 @@ class SglArgument(SglExpr):


  class SglImage(SglExpr):
- def __init__(self, path):
+ def __init__(self, path: str):
  self.path = path

  def __repr__(self) -> str:
@@ -356,7 +383,7 @@ class SglImage(SglExpr):


  class SglVideo(SglExpr):
- def __init__(self, path, num_frames):
+ def __init__(self, path: str, num_frames: int):
  self.path = path
  self.num_frames = num_frames

@@ -367,18 +394,23 @@ class SglVideo(SglExpr):
  class SglGen(SglExpr):
  def __init__(
  self,
- name,
- max_new_tokens,
- stop,
- temperature,
- top_p,
- top_k,
- frequency_penalty,
- presence_penalty,
- ignore_eos,
- dtype,
- regex,
+ name: Optional[str] = None,
+ max_new_tokens: Optional[int] = None,
+ stop: Optional[Union[str, List[str]]] = None,
+ temperature: Optional[float] = None,
+ top_p: Optional[float] = None,
+ top_k: Optional[int] = None,
+ frequency_penalty: Optional[float] = None,
+ presence_penalty: Optional[float] = None,
+ ignore_eos: Optional[bool] = None,
+ return_logprob: Optional[bool] = None,
+ logprob_start_len: Optional[int] = None,
+ top_logprobs_num: Optional[int] = None,
+ return_text_in_logprobs: Optional[bool] = None,
+ dtype: Optional[type] = None,
+ regex: Optional[str] = None,
  ):
+ """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
  super().__init__()
  self.name = name
  self.sampling_params = SglSamplingParams(
@@ -390,6 +422,10 @@ class SglGen(SglExpr):
  frequency_penalty=frequency_penalty,
  presence_penalty=presence_penalty,
  ignore_eos=ignore_eos,
+ return_logprob=return_logprob,
+ logprob_start_len=logprob_start_len,
+ top_logprobs_num=top_logprobs_num,
+ return_text_in_logprobs=return_text_in_logprobs,
  dtype=dtype,
  regex=regex,
  )
@@ -399,7 +435,7 @@ class SglGen(SglExpr):


  class SglConstantText(SglExpr):
- def __init__(self, value):
+ def __init__(self, value: str):
  super().__init__()
  self.value = value

@@ -408,7 +444,7 @@ class SglConstantText(SglExpr):


  class SglRoleBegin(SglExpr):
- def __init__(self, role):
+ def __init__(self, role: str):
  super().__init__()
  self.role = role

@@ -417,7 +453,7 @@ class SglRoleBegin(SglExpr):


  class SglRoleEnd(SglExpr):
- def __init__(self, role):
+ def __init__(self, role: str):
  super().__init__()
  self.role = role

@@ -426,7 +462,7 @@ class SglRoleEnd(SglExpr):


  class SglSelect(SglExpr):
- def __init__(self, name, choices, temperature):
+ def __init__(self, name: str, choices: List[str], temperature: float):
  super().__init__()
  self.name = name
  self.choices = choices
@@ -437,7 +473,7 @@ class SglSelect(SglExpr):


  class SglFork(SglExpr):
- def __init__(self, number, position_ids_offset=None):
+ def __init__(self, number: int, position_ids_offset=None):
  super().__init__()
  self.number = number
  self.position_ids_offset = position_ids_offset
@@ -450,7 +486,7 @@ class SglFork(SglExpr):


  class SglGetForkItem(SglExpr):
- def __init__(self, index):
+ def __init__(self, index: int):
  super().__init__()
  self.index = index

@@ -459,7 +495,7 @@ class SglGetForkItem(SglExpr):


  class SglVariable(SglExpr):
- def __init__(self, name, source):
+ def __init__(self, name: str, source):
  super().__init__()
  self.name = name
  self.source = source
@@ -469,7 +505,7 @@ class SglVariable(SglExpr):


  class SglVarScopeBegin(SglExpr):
- def __init__(self, name):
+ def __init__(self, name: str):
  super().__init__()
  self.name = name

@@ -478,7 +514,7 @@ class SglVarScopeBegin(SglExpr):


  class SglVarScopeEnd(SglExpr):
- def __init__(self, name):
+ def __init__(self, name: str):
  super().__init__()
  self.name = name

@@ -500,4 +536,4 @@ class SglCommitLazy(SglExpr):
  super().__init__()

  def __repr__(self):
- return f"CommitLazy()"
+ return "CommitLazy()"
sglang/srt/constrained/__init__.py CHANGED
@@ -5,13 +5,14 @@ from pydantic import BaseModel

  try:
  from outlines.caching import cache as disk_cache
- from outlines.fsm.guide import RegexGuide
  from outlines.caching import disable_cache
  from outlines.fsm.guide import RegexGuide
  from outlines.fsm.regex import FSMInfo, make_byte_level_fsm, make_deterministic_fsm
  from outlines.models.transformers import TransformerTokenizer
  except ImportError as e:
- print(f'\nError: {e}. Please install a new version of outlines by `pip install "outlines>=0.0.44"`\n')
+ print(
+ f'\nError: {e}. Please install a new version of outlines by `pip install "outlines>=0.0.44"`\n'
+ )
  raise

  try:
sglang/srt/hf_transformers_utils.py CHANGED
@@ -264,7 +264,9 @@ class TiktokenTokenizer:
  return self.tokenizer.decode_batch(batch)

  def apply_chat_template(self, messages, tokenize, add_generation_prompt):
- ret = self.chat_template.render(messages=messages, add_generation_prompt=add_generation_prompt)
+ ret = self.chat_template.render(
+ messages=messages, add_generation_prompt=add_generation_prompt
+ )
  return self.encode(ret) if tokenize else ret


@@ -297,5 +299,7 @@ class SentencePieceTokenizer:
  return self.tokenizer.decode(batch)

  def apply_chat_template(self, messages, tokenize, add_generation_prompt):
- ret = self.chat_template.render(messages=messages, add_generation_prompt=add_generation_prompt)
- return self.encode(ret) if tokenize else ret
+ ret = self.chat_template.render(
+ messages=messages, add_generation_prompt=add_generation_prompt
+ )
+ return self.encode(ret) if tokenize else ret
sglang/srt/layers/extend_attention.py CHANGED
@@ -191,6 +191,7 @@ def extend_attention_fwd(
  b_seq_len_extend,
  max_len_in_batch,
  max_len_extend,
+ sm_scale=None,
  logit_cap=-1,
  ):
  """
@@ -213,7 +214,7 @@
  else:
  BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)

- sm_scale = 1.0 / (Lq**0.5)
+ sm_scale = 1.0 / (Lq**0.5) if sm_scale is None else sm_scale
  batch_size, head_num = b_seq_len.shape[0], q_extend.shape[1]
  kv_group_num = q_extend.shape[1] // k_extend.shape[1]
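Note: the new sm_scale argument defaults to the usual 1/sqrt(d) attention scaling when left as None; a short sketch of the arithmetic (the head dimension is illustrative):

import math

Lq = 128                                              # query head dimension
sm_scale = None                                       # caller did not override
sm_scale = 1.0 / (Lq**0.5) if sm_scale is None else sm_scale
assert abs(sm_scale - 1.0 / math.sqrt(128)) < 1e-12   # ~0.0884
# Passing sm_scale explicitly lets callers replace the 1/sqrt(head_dim) default,
# e.g. for models that scale attention logits differently.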