sglang 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. sglang/__init__.py +55 -2
  2. sglang/api.py +3 -5
  3. sglang/backend/anthropic.py +18 -4
  4. sglang/backend/openai.py +2 -1
  5. sglang/backend/runtime_endpoint.py +18 -5
  6. sglang/backend/vertexai.py +1 -0
  7. sglang/global_config.py +1 -0
  8. sglang/lang/chat_template.py +74 -0
  9. sglang/lang/interpreter.py +40 -16
  10. sglang/lang/tracer.py +6 -4
  11. sglang/launch_server.py +2 -1
  12. sglang/srt/constrained/fsm_cache.py +1 -0
  13. sglang/srt/constrained/jump_forward.py +1 -0
  14. sglang/srt/conversation.py +2 -2
  15. sglang/srt/hf_transformers_utils.py +2 -1
  16. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  17. sglang/srt/layers/extend_attention.py +1 -0
  18. sglang/srt/layers/logits_processor.py +114 -54
  19. sglang/srt/layers/radix_attention.py +2 -1
  20. sglang/srt/layers/token_attention.py +1 -0
  21. sglang/srt/managers/detokenizer_manager.py +5 -1
  22. sglang/srt/managers/io_struct.py +12 -0
  23. sglang/srt/managers/router/infer_batch.py +70 -33
  24. sglang/srt/managers/router/manager.py +7 -2
  25. sglang/srt/managers/router/model_rpc.py +116 -73
  26. sglang/srt/managers/router/model_runner.py +111 -167
  27. sglang/srt/managers/router/radix_cache.py +46 -38
  28. sglang/srt/managers/tokenizer_manager.py +56 -11
  29. sglang/srt/memory_pool.py +5 -14
  30. sglang/srt/model_config.py +7 -0
  31. sglang/srt/models/commandr.py +376 -0
  32. sglang/srt/models/dbrx.py +413 -0
  33. sglang/srt/models/dbrx_config.py +281 -0
  34. sglang/srt/models/gemma.py +22 -20
  35. sglang/srt/models/llama2.py +23 -21
  36. sglang/srt/models/llava.py +12 -10
  37. sglang/srt/models/mixtral.py +27 -25
  38. sglang/srt/models/qwen.py +23 -21
  39. sglang/srt/models/qwen2.py +23 -21
  40. sglang/srt/models/stablelm.py +20 -21
  41. sglang/srt/models/yivl.py +6 -5
  42. sglang/srt/openai_api_adapter.py +356 -0
  43. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +36 -20
  44. sglang/srt/sampling_params.py +2 -0
  45. sglang/srt/server.py +68 -447
  46. sglang/srt/server_args.py +76 -49
  47. sglang/srt/utils.py +88 -32
  48. sglang/srt/weight_utils.py +402 -0
  49. sglang/test/test_programs.py +8 -7
  50. sglang/test/test_utils.py +195 -7
  51. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/METADATA +12 -14
  52. sglang-0.1.15.dist-info/RECORD +69 -0
  53. sglang-0.1.14.dist-info/RECORD +0 -64
  54. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/LICENSE +0 -0
  55. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/WHEEL +0 -0
  56. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/top_level.txt +0 -0
sglang/__init__.py CHANGED
@@ -1,4 +1,57 @@
-__version__ = "0.1.14"
+__version__ = "0.1.15"
 
-from sglang.api import *
+# SGL API Components
+from sglang.api import (
+    Runtime,
+    assistant,
+    assistant_begin,
+    assistant_end,
+    flush_cache,
+    function,
+    gen,
+    gen_int,
+    gen_string,
+    get_server_args,
+    image,
+    select,
+    set_default_backend,
+    system,
+    user,
+    user_begin,
+    user_end,
+)
+
+# SGL Backends
+from sglang.backend.anthropic import Anthropic
+from sglang.backend.openai import OpenAI
+from sglang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.backend.vertexai import VertexAI
+
+# Global Configurations
 from sglang.global_config import global_config
+
+# public APIs management
+__all__ = [
+    "global_config",
+    "Anthropic",
+    "OpenAI",
+    "RuntimeEndpoint",
+    "VertexAI",
+    "function",
+    "Runtime",
+    "set_default_backend",
+    "flush_cache",
+    "get_server_args",
+    "gen",
+    "gen_int",
+    "gen_string",
+    "image",
+    "select",
+    "system",
+    "user",
+    "assistant",
+    "user_begin",
+    "user_end",
+    "assistant_begin",
+    "assistant_end",
+]
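With `__all__` and the explicit re-exports in place, the whole public surface is importable from the top-level package. A minimal usage sketch, not part of the diff; the model name and prompt are placeholders and it assumes an OpenAI API key is configured:

import sglang as sgl

@sgl.function
def qa(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))

# Any of the re-exported backends above can serve as the default backend.
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
state = qa.run(question="What does sglang 0.1.15 add?")
print(state["answer"])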
sglang/api.py CHANGED
@@ -1,13 +1,10 @@
-"""Public API"""
+"""Some Public API Definitions"""
 
+import os
 import re
 from typing import Callable, List, Optional, Union
 
-from sglang.backend.anthropic import Anthropic
 from sglang.backend.base_backend import BaseBackend
-from sglang.backend.openai import OpenAI
-from sglang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.backend.vertexai import VertexAI
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglExpr,
@@ -35,6 +32,7 @@ def function(
 
 def Runtime(*args, **kwargs):
     # Avoid importing unnecessary dependency
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
     from sglang.srt.server import Runtime
 
     return Runtime(*args, **kwargs)
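For context, the lazy `Runtime` wrapper is typically used like the sketch below; the model path is a placeholder and the call pattern is the usual one for this version, not something introduced by this diff:

import sglang as sgl

# Hedged sketch: start a local runtime through the wrapper above (which now also
# silences TensorFlow logging via TF_CPP_MIN_LOG_LEVEL before importing sglang.srt),
# use it as the default backend, then shut it down.
runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
sgl.set_default_backend(runtime)
# ... run sgl.function programs here ...
runtime.shutdown()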
sglang/backend/anthropic.py CHANGED
@@ -1,6 +1,7 @@
 from typing import List, Optional, Union
 
 import numpy as np
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.interpreter import StreamExecutor
@@ -13,7 +14,7 @@ except ImportError as e:
 
 
 class Anthropic(BaseBackend):
-    def __init__(self, model_name):
+    def __init__(self, model_name, *args, **kwargs):
         super().__init__()
 
         if isinstance(anthropic, Exception):
@@ -21,6 +22,7 @@ class Anthropic(BaseBackend):
 
         self.model_name = model_name
         self.chat_template = get_chat_template("claude")
+        self.client = anthropic.Anthropic(*args, **kwargs)
 
     def get_chat_template(self):
         return self.chat_template
@@ -35,8 +37,14 @@ class Anthropic(BaseBackend):
         else:
             messages = [{"role": "user", "content": s.text_}]
 
-        ret = anthropic.Anthropic().messages.create(
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        ret = self.client.messages.create(
             model=self.model_name,
+            system=system,
             messages=messages,
             **sampling_params.to_anthropic_kwargs(),
         )
@@ -54,10 +62,16 @@ class Anthropic(BaseBackend):
         else:
             messages = [{"role": "user", "content": s.text_}]
 
-        with anthropic.Anthropic().messages.stream(
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        with self.client.messages.stream(
             model=self.model_name,
+            system=system,
             messages=messages,
             **sampling_params.to_anthropic_kwargs(),
         ) as stream:
             for text in stream.text_stream:
-                yield text, {}
+                yield text, {}
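The practical effect of these hunks: the backend now reuses a single `anthropic.Anthropic(*args, **kwargs)` client, and a leading system-role message is pulled out of `messages` and passed through the Messages API's top-level `system` parameter instead of being sent as a chat turn. A standalone sketch of just the hoisting step, with illustrative data:

# Illustrative messages, not taken from the diff.
messages = [
    {"role": "system", "content": "Answer in one sentence."},
    {"role": "user", "content": "What is SGLang?"},
]
if messages and messages[0]["role"] == "system":
    system = messages.pop(0)["content"]
else:
    system = ""
# system == "Answer in one sentence."; messages now holds only the user turn,
# which is what self.client.messages.create(model=..., system=system, messages=messages)
# receives in the code above.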
sglang/backend/openai.py CHANGED
@@ -3,6 +3,7 @@ import time
 from typing import Callable, List, Optional, Union
 
 import numpy as np
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path
 from sglang.lang.interpreter import StreamExecutor
@@ -227,7 +228,7 @@ class OpenAI(BaseBackend):
             prompt_tokens.append(ret_token)
 
         decision = choices[np.argmax(scores)]
-        return decision, scores, scores
+        return decision, scores, None, None
 
 
 def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
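This aligns the OpenAI backend with the four-value `select()` contract used throughout this release: `(decision, normalized_prompt_logprobs, prefill_token_logprobs, decode_token_logprobs)`. A minimal sketch of the shape, standalone and not the backend's actual method:

import numpy as np

def select_like_openai(choices, scores):
    # scores: one normalized score per choice; the OpenAI backend has no per-token
    # logprobs to report here, so the last two slots of the tuple are None.
    decision = choices[int(np.argmax(scores))]
    return decision, scores, None, None

decision, norm_logprobs, prefill_lp, decode_lp = select_like_openai(
    ["yes", "no"], [-0.3, -1.2]
)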
sglang/backend/runtime_endpoint.py CHANGED
@@ -3,6 +3,7 @@ from typing import Callable, List, Optional, Union
 
 import numpy as np
 import requests
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.global_config import global_config
 from sglang.lang.chat_template import get_chat_template_by_model_path
@@ -73,9 +74,11 @@ class RuntimeEndpoint(BaseBackend):
         assert res.status_code == 200
 
     def commit_lazy_operations(self, s: StreamExecutor):
+        data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
+        self._add_images(s, data)
         res = http_request(
             self.base_url + "/generate",
-            json={"text": s.text_, "sampling_params": {"max_new_tokens": 0}},
+            json=data,
             auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
@@ -104,6 +107,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 **sampling_params.to_srt_kwargs(),
             },
         }
@@ -112,6 +116,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 "dtype": "int",
                 **sampling_params.to_srt_kwargs(),
             },
@@ -142,6 +147,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 **sampling_params.to_srt_kwargs(),
             },
         }
@@ -150,6 +156,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 "dtype": "int",
                 **sampling_params.to_srt_kwargs(),
             },
@@ -224,13 +231,19 @@ class RuntimeEndpoint(BaseBackend):
         )
         assert res.status_code == 200
         obj = res.json()
-        normalized_prompt_logprob = [
+        normalized_prompt_logprobs = [
             r["meta_info"]["normalized_prompt_logprob"] for r in obj
         ]
-        prompt_logprob = [r["meta_info"]["prompt_logprob"] for r in obj]
+        decision = choices[np.argmax(normalized_prompt_logprobs)]
+        prefill_token_logprobs = [r["meta_info"]["prefill_token_logprobs"] for r in obj]
+        decode_token_logprobs = [r["meta_info"]["decode_token_logprobs"] for r in obj]
 
-        decision = choices[np.argmax(normalized_prompt_logprob)]
-        return decision, normalized_prompt_logprob, prompt_logprob
+        return (
+            decision,
+            normalized_prompt_logprobs,
+            prefill_token_logprobs,
+            decode_token_logprobs,
+        )
 
     def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
         res = http_request(
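Each of those payload hunks forwards the new `global_config.spaces_between_special_tokens_in_out` flag (added in global_config.py below) alongside `skip_special_tokens`. Roughly, the `sampling_params` sent to `/generate` now look like this sketch; the values shown are the defaults and `max_new_tokens` is illustrative:

from sglang.global_config import global_config

sampling_params = {
    # True by default
    "skip_special_tokens": global_config.skip_special_tokens_in_output,
    # New in 0.1.15, also True by default
    "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
    "max_new_tokens": 16,
}
# Setting global_config.spaces_between_special_tokens_in_out = False before a call
# turns the extra spacing off for subsequent RuntimeEndpoint requests.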
sglang/backend/vertexai.py CHANGED
@@ -3,6 +3,7 @@ import warnings
 from typing import List, Optional, Union
 
 import numpy as np
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.interpreter import StreamExecutor
sglang/global_config.py CHANGED
@@ -12,6 +12,7 @@ class GlobalConfig:
 
         # Output configs
         self.skip_special_tokens_in_output = True
+        self.spaces_between_special_tokens_in_out = True
 
         # Optimization configs
         self.eager_fill_image = False
sglang/lang/chat_template.py CHANGED
@@ -162,6 +162,28 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="llama-3-instruct",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|start_header_id|>system<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "user": (
+                "<|start_header_id|>user<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "assistant": (
+                "<|start_header_id|>assistant<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+        },
+        stop_str=("<|eot_id|>",),
+    )
+)
+
 # Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
 register_chat_template(
     ChatTemplate(
@@ -192,6 +214,44 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="dbrx-instruct",
+        default_system_prompt="You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>"),
+            "user": ("\n<|im_start|>user\n", "<|im_end|>"),
+            "assistant": ("\n<|im_start|>assistant\n", "<|im_end|>"),
+        },
+        stop_str=("<|im_end|>",),
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="c4ai-command-r",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
+                "<|END_OF_TURN_TOKEN|>",
+            ),
+            "user": ("<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|END_OF_TURN_TOKEN|>"),
+            "assistant": (
+                "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+                "<|END_OF_TURN_TOKEN|>",
+            ),
+        },
+        style=ChatTemplateStyle.PLAIN,
+    )
+)
+
+
+@register_chat_template_matching_function
+def match_dbrx(model_path: str):
+    if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
+        return get_chat_template("dbrx-instruct")
+
 
 @register_chat_template_matching_function
 def match_vicuna(model_path: str):
@@ -214,6 +274,13 @@ def match_llama2_chat(model_path: str):
     return get_chat_template("llama-2-chat")
 
 
+@register_chat_template_matching_function
+def match_llama3_instruct(model_path: str):
+    model_path = model_path.lower()
+    if "llama-3" in model_path and "instruct" in model_path:
+        return get_chat_template("llama-3-instruct")
+
+
 @register_chat_template_matching_function
 def match_chat_ml(model_path: str):
     model_path = model_path.lower()
@@ -239,6 +306,13 @@ def match_gemma_it(model_path: str):
     return get_chat_template("gemma-it")
 
 
+@register_chat_template_matching_function
+def match_c4ai_command_r(model_path: str):
+    model_path = model_path.lower()
+    if "c4ai-command-r" in model_path:
+        return get_chat_template("c4ai-command-r")
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default
sglang/lang/interpreter.py CHANGED
@@ -1,6 +1,7 @@
 """The interpreter that executes SGL programs"""
 
 import asyncio
+import contextvars
 import multiprocessing
 import queue
 import threading
@@ -10,6 +11,7 @@ from contextlib import contextmanager
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import tqdm
+
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglCommitLazy,
@@ -217,7 +219,13 @@ class StreamExecutor:
         self.use_thread = use_thread
         if self.use_thread:
             self.queue = queue.Queue()
-            self.worker = threading.Thread(target=self._thread_worker_func)
+
+            def _run_worker_in_context():
+                self._thread_worker_func()
+
+            self.worker = threading.Thread(
+                target=contextvars.copy_context().run, args=(_run_worker_in_context,)
+            )
             self.worker.start()
 
         # For streaming
@@ -248,17 +256,24 @@ class StreamExecutor:
     def set_var(self, name, value):
        self.variables[name] = value
 
-    def get_meta_info(self, name):
+    def get_meta_info(self, name, timeout=None):
         if name in self.variable_event:
-            self.variable_event[name].wait()
+            got = self.variable_event[name].wait(timeout)
+            if not got:
+                raise TimeoutError(f"Timeout while waiting for event '{name}'")
         ret = self.meta_info.get(name, None)
         return ret
 
-    def fork(self, number: int, position_ids_offset: Optional[List[int]] = None):
-        self.submit(SglCommitLazy())
-        self.sync()
+    def fork(
+        self,
+        size: int = 1,
+        position_ids_offset: Optional[List[int]] = None,
+    ):
+        if size > 1:
+            self.submit(SglCommitLazy())
 
-        number = int(number)
+        self.sync()
+        size = int(size)
 
         exes = [
             StreamExecutor(
@@ -268,14 +283,15 @@ class StreamExecutor:
                 self.chat_template,
                 self.stream,
             )
-            for _ in range(number)
+            for _ in range(size)
         ]
-        for i in range(number):
+        for i in range(size):
             exes[i].variables = dict(self.variables)
             exes[i].text_ = str(self.text_)
             exes[i].messages_ = list(self.messages_)
             exes[i].cur_role = self.cur_role
             exes[i].fork_start_text_pos = len(self.text_)
+            exes[i].images_ = list(self.images_)
 
         return exes
 
@@ -454,15 +470,19 @@ class StreamExecutor:
             self.stream_var_event[name].set()
 
     def _execute_select(self, expr: SglSelect):
-        decision, normalized_prompt_logprob, prompt_logprob = self.backend.select(
-            self, expr.choices, expr.temperature
-        )
+        (
+            decision,
+            normalized_prompt_logprobs,
+            prefill_token_logprobs,
+            decode_token_logprobs,
+        ) = self.backend.select(self, expr.choices, expr.temperature)
         if expr.name is not None:
             name = expr.name
             self.variables[name] = decision
             self.meta_info[name] = {
-                "normalized_prompt_logprob": normalized_prompt_logprob,
-                "prompt_logprob": prompt_logprob,
+                "normalized_prompt_logprobs": normalized_prompt_logprobs,
+                "prefill_token_logprobs": prefill_token_logprobs,
+                "decode_token_logprobs": decode_token_logprobs,
             }
             self.variable_event[name].set()
             self.text_ += decision
@@ -634,8 +654,12 @@ class ProgramState:
             yield
             self.stream_executor.submit(SglVarScopeEnd(name))
 
-    def fork(self, number: int = 1, position_ids_offset: Optional[List[int]] = None):
-        stream_executors = self.stream_executor.fork(number, position_ids_offset)
+    def fork(
+        self,
+        size: int = 1,
+        position_ids_offset: Optional[List[int]] = None,
+    ):
+        stream_executors = self.stream_executor.fork(size, position_ids_offset)
         states = [ProgramState(x) for x in stream_executors]
         state_group = ProgramStateGroup(states, self)
         return state_group
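Taken together: `fork()` now takes `size` (formerly `number`), forked executors also copy `images_`, and the meta info recorded by `select` uses the new plural key names. A hedged usage sketch; the program body is illustrative and assumes a default backend has already been set:

import sglang as sgl

@sgl.function
def pick_and_branch(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.select("verdict", choices=["yes", "no"]))
    # fork() now takes `size` instead of `number`; each fork inherits images_ too.
    forks = s.fork(size=2)
    for i, f in enumerate(forks):
        f += sgl.assistant(sgl.gen(f"reason_{i}", max_tokens=32))

state = pick_and_branch.run(question="Is 17 prime?")
# Meta info keys were renamed from "normalized_prompt_logprob" / "prompt_logprob":
info = state.get_meta_info("verdict")
print(info["normalized_prompt_logprobs"], info["prefill_token_logprobs"])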
sglang/lang/tracer.py CHANGED
@@ -109,19 +109,21 @@ class TracerProgramState(ProgramState):
     ########### Public API ###########
     ##################################
 
-    def fork(self, number: int, position_ids_offset: Optional[List[int]] = None):
+    def fork(self, size: int = 1, position_ids_offset: Optional[List[int]] = None):
+        assert (size >= 1)
+
         if self.only_trace_prefix:
             raise StopTracing()
 
-        fork_node = SglFork(number)
+        fork_node = SglFork(size)
         fork_node.prev_node = self.last_node
 
         states = [
             TracerProgramState(self.backend, self.arguments, self.only_trace_prefix)
-            for _ in range(number)
+            for _ in range(size)
         ]
 
-        for i in range(number):
+        for i in range(size):
             node = SglGetForkItem(i)
             node.prev_node = fork_node
             states[i].last_node = node
sglang/launch_server.py CHANGED
@@ -2,10 +2,11 @@ import argparse
 
 from sglang.srt.server import ServerArgs, launch_server
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     args = parser.parse_args()
     server_args = ServerArgs.from_cli_args(args)
 
-    launch_server(server_args, None)
+    launch_server(server_args, None)
sglang/srt/constrained/fsm_cache.py CHANGED
@@ -7,6 +7,7 @@ class FSMCache(BaseCache):
         super().__init__(enable=enable)
 
         from importlib.metadata import version
+
         if version("outlines") >= "0.0.35":
             from transformers import AutoTokenizer
 
sglang/srt/constrained/jump_forward.py CHANGED
@@ -1,4 +1,5 @@
 import interegular
+
 from sglang.srt.constrained import FSMInfo, disk_cache, make_deterministic_fsm
 from sglang.srt.constrained.base_cache import BaseCache
 
sglang/srt/conversation.py CHANGED
@@ -4,7 +4,7 @@ import dataclasses
 from enum import IntEnum, auto
 from typing import Dict, List, Optional, Tuple, Union
 
-from sglang.srt.managers.openai_protocol import ChatCompletionRequest
+from sglang.srt.openai_protocol import ChatCompletionRequest
 
 
 class SeparatorStyle(IntEnum):
@@ -400,7 +400,7 @@ register_conv_template(
     Conversation(
         name="chatml",
         system_template="<|im_start|>system\n{system_message}",
-        system_message="You are an AI assistant.",
+        system_message="You are a helpful assistant.",
         roles=("<|im_start|>user", "<|im_start|>assistant"),
         sep_style=SeparatorStyle.CHATML,
         sep="<|im_end|>",
sglang/srt/hf_transformers_utils.py CHANGED
@@ -6,7 +6,6 @@ import warnings
 from typing import List, Optional, Tuple, Union
 
 from huggingface_hub import snapshot_download
-from sglang.srt.utils import is_multimodal_model
 from transformers import (
     AutoConfig,
     AutoProcessor,
@@ -15,6 +14,8 @@ from transformers import (
     PreTrainedTokenizerFast,
 )
 
+from sglang.srt.utils import is_multimodal_model
+
 
 def download_from_hf(model_path: str):
     if os.path.exists(model_path):
sglang/srt/layers/context_flashattention_nopad.py CHANGED
@@ -3,6 +3,7 @@
 import torch
 import triton
 import triton.language as tl
+
 from sglang.srt.utils import wrap_kernel_launcher
 
 CUDA_CAPABILITY = torch.cuda.get_device_capability()
sglang/srt/layers/extend_attention.py CHANGED
@@ -1,6 +1,7 @@
 import torch
 import triton
 import triton.language as tl
+
 from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
 from sglang.srt.utils import wrap_kernel_launcher
 