sglang 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff shows the changes between two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- sglang/__init__.py +57 -2
- sglang/api.py +8 -5
- sglang/backend/anthropic.py +18 -4
- sglang/backend/openai.py +2 -1
- sglang/backend/runtime_endpoint.py +18 -5
- sglang/backend/vertexai.py +1 -0
- sglang/global_config.py +5 -1
- sglang/lang/chat_template.py +83 -2
- sglang/lang/interpreter.py +92 -35
- sglang/lang/ir.py +12 -9
- sglang/lang/tracer.py +6 -4
- sglang/launch_server_llavavid.py +31 -0
- sglang/srt/constrained/fsm_cache.py +1 -0
- sglang/srt/constrained/jump_forward.py +1 -0
- sglang/srt/conversation.py +2 -2
- sglang/srt/flush_cache.py +16 -0
- sglang/srt/hf_transformers_utils.py +10 -2
- sglang/srt/layers/context_flashattention_nopad.py +1 -0
- sglang/srt/layers/extend_attention.py +1 -0
- sglang/srt/layers/logits_processor.py +114 -54
- sglang/srt/layers/radix_attention.py +2 -1
- sglang/srt/layers/token_attention.py +1 -0
- sglang/srt/managers/detokenizer_manager.py +5 -1
- sglang/srt/managers/io_struct.py +27 -3
- sglang/srt/managers/router/infer_batch.py +97 -48
- sglang/srt/managers/router/manager.py +11 -8
- sglang/srt/managers/router/model_rpc.py +169 -90
- sglang/srt/managers/router/model_runner.py +110 -166
- sglang/srt/managers/router/radix_cache.py +89 -51
- sglang/srt/managers/router/scheduler.py +17 -28
- sglang/srt/managers/tokenizer_manager.py +110 -33
- sglang/srt/memory_pool.py +5 -14
- sglang/srt/model_config.py +11 -0
- sglang/srt/models/commandr.py +372 -0
- sglang/srt/models/dbrx.py +412 -0
- sglang/srt/models/dbrx_config.py +281 -0
- sglang/srt/models/gemma.py +24 -25
- sglang/srt/models/llama2.py +25 -26
- sglang/srt/models/llava.py +8 -10
- sglang/srt/models/llavavid.py +307 -0
- sglang/srt/models/mixtral.py +29 -33
- sglang/srt/models/qwen.py +34 -25
- sglang/srt/models/qwen2.py +25 -26
- sglang/srt/models/stablelm.py +26 -26
- sglang/srt/models/yivl.py +3 -5
- sglang/srt/openai_api_adapter.py +356 -0
- sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +36 -20
- sglang/srt/sampling_params.py +2 -0
- sglang/srt/server.py +91 -456
- sglang/srt/server_args.py +79 -49
- sglang/srt/utils.py +212 -47
- sglang/srt/weight_utils.py +417 -0
- sglang/test/test_programs.py +8 -7
- sglang/test/test_utils.py +195 -7
- sglang/utils.py +77 -26
- {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/METADATA +20 -18
- sglang-0.1.16.dist-info/RECORD +72 -0
- sglang-0.1.14.dist-info/RECORD +0 -64
- {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/LICENSE +0 -0
- {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/WHEEL +0 -0
- {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
@@ -1,4 +1,59 @@
-__version__ = "0.1.14"
+__version__ = "0.1.16"
 
-
+# SGL API Components
+from sglang.api import (
+    Runtime,
+    assistant,
+    assistant_begin,
+    assistant_end,
+    flush_cache,
+    function,
+    gen,
+    gen_int,
+    gen_string,
+    get_server_args,
+    image,
+    select,
+    set_default_backend,
+    system,
+    user,
+    user_begin,
+    user_end,
+    video,
+)
+
+# SGL Backends
+from sglang.backend.anthropic import Anthropic
+from sglang.backend.openai import OpenAI
+from sglang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.backend.vertexai import VertexAI
+
+# Global Configurations
 from sglang.global_config import global_config
+
+# public APIs management
+__all__ = [
+    "global_config",
+    "Anthropic",
+    "OpenAI",
+    "RuntimeEndpoint",
+    "VertexAI",
+    "function",
+    "Runtime",
+    "set_default_backend",
+    "flush_cache",
+    "get_server_args",
+    "gen",
+    "gen_int",
+    "gen_string",
+    "image",
+    "video",
+    "select",
+    "system",
+    "user",
+    "assistant",
+    "user_begin",
+    "user_end",
+    "assistant_begin",
+    "assistant_end",
+]
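With 0.1.16 the package root re-exports the frontend primitives and the backend classes directly, so user code no longer needs to import from submodules. Below is a minimal usage sketch of the flattened namespace; the endpoint URL and prompt are placeholders, not part of the diff.

```python
import sglang as sgl

@sgl.function
def qa(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))

# Any of the re-exported backends (OpenAI, Anthropic, VertexAI, RuntimeEndpoint)
# can be installed as the default backend.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

state = qa.run(question="What does SGLang do?")
print(state["answer"])
```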
sglang/api.py
CHANGED
@@ -1,13 +1,10 @@
-"""Public API"""
+"""Some Public API Definitions"""
 
+import os
 import re
 from typing import Callable, List, Optional, Union
 
-from sglang.backend.anthropic import Anthropic
 from sglang.backend.base_backend import BaseBackend
-from sglang.backend.openai import OpenAI
-from sglang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.backend.vertexai import VertexAI
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglExpr,
@@ -18,6 +15,7 @@ from sglang.lang.ir import (
     SglRoleBegin,
     SglRoleEnd,
     SglSelect,
+    SglVideo,
 )
 
 
@@ -35,6 +33,7 @@ def function(
 
 def Runtime(*args, **kwargs):
     # Avoid importing unnecessary dependency
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
     from sglang.srt.server import Runtime
 
     return Runtime(*args, **kwargs)
@@ -153,6 +152,10 @@ def image(expr: SglExpr):
     return SglImage(expr)
 
 
+def video(path: str, num_frames: int):
+    return SglVideo(path, num_frames)
+
+
 def select(
     name: Optional[str] = None,
     choices: List[str] = None,
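The new `video(path, num_frames)` helper mirrors `image()`: it wraps a file path and a frame count into an `SglVideo` expression that the interpreter later encodes and appends to the prompt. A hedged sketch of how it would be composed into a program follows; the file name and prompt text are placeholders.

```python
import sglang as sgl

@sgl.function
def describe_clip(s, clip_path):
    # sgl.video() is the new primitive added in this release; the rest follows
    # the same pattern as sgl.image().
    s += sgl.user(sgl.video(clip_path, num_frames=16) + "Describe what happens in this clip.")
    s += sgl.assistant(sgl.gen("description", max_tokens=128))
```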
sglang/backend/anthropic.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import List, Optional, Union
 
 import numpy as np
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.interpreter import StreamExecutor
@@ -13,7 +14,7 @@ except ImportError as e:
 
 
 class Anthropic(BaseBackend):
-    def __init__(self, model_name):
+    def __init__(self, model_name, *args, **kwargs):
         super().__init__()
 
         if isinstance(anthropic, Exception):
@@ -21,6 +22,7 @@ class Anthropic(BaseBackend):
 
         self.model_name = model_name
         self.chat_template = get_chat_template("claude")
+        self.client = anthropic.Anthropic(*args, **kwargs)
 
     def get_chat_template(self):
         return self.chat_template
@@ -35,8 +37,14 @@ class Anthropic(BaseBackend):
         else:
             messages = [{"role": "user", "content": s.text_}]
 
-
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        ret = self.client.messages.create(
             model=self.model_name,
+            system=system,
             messages=messages,
             **sampling_params.to_anthropic_kwargs(),
         )
@@ -54,10 +62,16 @@ class Anthropic(BaseBackend):
         else:
             messages = [{"role": "user", "content": s.text_}]
 
-
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        with self.client.messages.stream(
             model=self.model_name,
+            system=system,
             messages=messages,
             **sampling_params.to_anthropic_kwargs(),
         ) as stream:
             for text in stream.text_stream:
-                yield text, {}
+                yield text, {}
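The Anthropic backend now builds a reusable `anthropic.Anthropic` client and, because the Messages API takes the system prompt as a separate `system` argument rather than as a chat message, it pops a leading system message off the list before calling `messages.create` / `messages.stream`. A standalone sketch of that extraction step is shown below; the helper name is illustrative, not from the diff.

```python
def split_system_message(messages):
    """Pop a leading system message so it can be passed via the `system` kwarg."""
    if messages and messages[0]["role"] == "system":
        system = messages.pop(0)["content"]
    else:
        system = ""
    return system, messages

system, messages = split_system_message([
    {"role": "system", "content": "Answer in one word."},
    {"role": "user", "content": "Capital of France?"},
])
assert system == "Answer in one word."
assert messages == [{"role": "user", "content": "Capital of France?"}]
```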
sglang/backend/openai.py
CHANGED
@@ -3,6 +3,7 @@ import time
 from typing import Callable, List, Optional, Union
 
 import numpy as np
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path
 from sglang.lang.interpreter import StreamExecutor
@@ -227,7 +228,7 @@ class OpenAI(BaseBackend):
             prompt_tokens.append(ret_token)
 
         decision = choices[np.argmax(scores)]
-        return decision, scores,
+        return decision, scores, None, None
 
 
 def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
sglang/backend/runtime_endpoint.py
CHANGED
@@ -3,6 +3,7 @@ from typing import Callable, List, Optional, Union
 
 import numpy as np
 import requests
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.global_config import global_config
 from sglang.lang.chat_template import get_chat_template_by_model_path
@@ -73,9 +74,11 @@ class RuntimeEndpoint(BaseBackend):
         assert res.status_code == 200
 
     def commit_lazy_operations(self, s: StreamExecutor):
+        data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
+        self._add_images(s, data)
         res = http_request(
             self.base_url + "/generate",
-            json=
+            json=data,
             auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
@@ -104,6 +107,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 **sampling_params.to_srt_kwargs(),
             },
         }
@@ -112,6 +116,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 "dtype": "int",
                 **sampling_params.to_srt_kwargs(),
             },
@@ -142,6 +147,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 **sampling_params.to_srt_kwargs(),
             },
         }
@@ -150,6 +156,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 "dtype": "int",
                 **sampling_params.to_srt_kwargs(),
             },
@@ -224,13 +231,19 @@
         )
         assert res.status_code == 200
         obj = res.json()
-
+        normalized_prompt_logprobs = [
             r["meta_info"]["normalized_prompt_logprob"] for r in obj
         ]
-
+        decision = choices[np.argmax(normalized_prompt_logprobs)]
+        prefill_token_logprobs = [r["meta_info"]["prefill_token_logprobs"] for r in obj]
+        decode_token_logprobs = [r["meta_info"]["decode_token_logprobs"] for r in obj]
 
-
-
+        return (
+            decision,
+            normalized_prompt_logprobs,
+            prefill_token_logprobs,
+            decode_token_logprobs,
+        )
 
     def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
         res = http_request(
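`select()` on the runtime endpoint now returns a 4-tuple: the chosen string, the normalized prompt log probabilities used for the argmax, and the per-token prefill/decode log probabilities reported by the server. The interpreter (see the interpreter.py diff below) stores the last three under the variable's meta info. A hedged sketch of reading them back follows, assuming the usual `ProgramState.get_meta_info` accessor; the variable name and choices are placeholders.

```python
import sglang as sgl

@sgl.function
def route(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.select("tool", choices=["search", "calculator"]))

state = route.run(question="What is 13 * 7?")
meta = state.get_meta_info("tool")
print(state["tool"])
print(meta["normalized_prompt_logprobs"])  # one logprob per choice
```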
sglang/backend/vertexai.py
CHANGED
sglang/global_config.py
CHANGED
@@ -12,10 +12,11 @@ class GlobalConfig:
 
         # Output configs
         self.skip_special_tokens_in_output = True
+        self.spaces_between_special_tokens_in_out = True
 
         # Optimization configs
         self.eager_fill_image = False
-        self.
+        self.enable_precache_with_tracing = True
         self.enable_parallel_encoding = True
         self.enable_parallel_decoding = True
 
@@ -24,5 +25,8 @@ class GlobalConfig:
         # adjust_cache: Adjust the position embedding of KV cache.
         self.concate_and_append_mode = "no_adjust"
 
+        # Request dependency time due to network delay
+        self.request_dependency_time = 0.03
+
 
 global_config = GlobalConfig()
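The new knobs are plain attributes on the `global_config` singleton and can be adjusted before running programs. A short sketch of toggling them:

```python
from sglang import global_config

# New output flag: keep spaces between special tokens during detokenization.
global_config.spaces_between_special_tokens_in_out = True

# New optimization flag: pre-cache the shared prefix of a batch by tracing the program.
global_config.enable_precache_with_tracing = False  # disable if tracing is undesirable

# New scheduling field: extra wait time that accounts for network delay
# between dependent requests.
global_config.request_dependency_time = 0.03
```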
sglang/lang/chat_template.py
CHANGED
@@ -162,6 +162,28 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="llama-3-instruct",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|start_header_id|>system<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "user": (
+                "<|start_header_id|>user<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "assistant": (
+                "<|start_header_id|>assistant<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+        },
+        stop_str=("<|eot_id|>",),
+    )
+)
+
 # Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
 register_chat_template(
     ChatTemplate(
@@ -192,6 +214,44 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="dbrx-instruct",
+        default_system_prompt="You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>"),
+            "user": ("\n<|im_start|>user\n", "<|im_end|>"),
+            "assistant": ("\n<|im_start|>assistant\n", "<|im_end|>"),
+        },
+        stop_str=("<|im_end|>",),
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="c4ai-command-r",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
+                "<|END_OF_TURN_TOKEN|>",
+            ),
+            "user": ("<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|END_OF_TURN_TOKEN|>"),
+            "assistant": (
+                "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+                "<|END_OF_TURN_TOKEN|>",
+            ),
+        },
+        style=ChatTemplateStyle.PLAIN,
+    )
+)
+
+
+@register_chat_template_matching_function
+def match_dbrx(model_path: str):
+    if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
+        return get_chat_template("dbrx-instruct")
+
 
 @register_chat_template_matching_function
 def match_vicuna(model_path: str):
@@ -199,6 +259,8 @@ def match_vicuna(model_path: str):
         return get_chat_template("vicuna_v1.1")
     if "llava-v1.5" in model_path.lower():
         return get_chat_template("vicuna_v1.1")
+    if "llava-next-video-7b" in model_path.lower():
+        return get_chat_template("vicuna_v1.1")
 
 
 @register_chat_template_matching_function
@@ -214,21 +276,33 @@ def match_llama2_chat(model_path: str):
         return get_chat_template("llama-2-chat")
 
 
+@register_chat_template_matching_function
+def match_llama3_instruct(model_path: str):
+    model_path = model_path.lower()
+    if "llama-3" in model_path and "instruct" in model_path:
+        return get_chat_template("llama-3-instruct")
+
+
 @register_chat_template_matching_function
 def match_chat_ml(model_path: str):
+    # import pdb;pdb.set_trace()
     model_path = model_path.lower()
     if "tinyllama" in model_path:
         return get_chat_template("chatml")
     if "qwen" in model_path and "chat" in model_path:
         return get_chat_template("chatml")
-    if
+    if (
+        "llava-v1.6-34b" in model_path
+        or "llava-v1.6-yi-34b" in model_path
+        or "llava-next-video-34b" in model_path
+    ):
         return get_chat_template("chatml-llava")
 
 
 @register_chat_template_matching_function
 def match_chat_yi(model_path: str):
     model_path = model_path.lower()
-    if "yi" in model_path:
+    if "yi" in model_path and "llava" not in model_path:
         return get_chat_template("yi")
 
 
@@ -239,6 +313,13 @@ def match_gemma_it(model_path: str):
         return get_chat_template("gemma-it")
 
 
+@register_chat_template_matching_function
+def match_c4ai_command_r(model_path: str):
+    model_path = model_path.lower()
+    if "c4ai-command-r" in model_path:
+        return get_chat_template("c4ai-command-r")
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default
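Each new template is registered by name and paired with a matching function, so models whose path contains the right substrings (for example "llama-3" plus "instruct") pick it up automatically. A small sketch of retrieving a registered template by name and inspecting its role markers, using only the fields shown in the diff above:

```python
from sglang.lang.chat_template import get_chat_template

tmpl = get_chat_template("llama-3-instruct")
prefix, suffix = tmpl.role_prefix_and_suffix["user"]
print(repr(prefix))   # '<|start_header_id|>user<|end_header_id|>\n\n'
print(repr(suffix))   # '<|eot_id|>'
print(tmpl.stop_str)  # ('<|eot_id|>',)
```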
sglang/lang/interpreter.py
CHANGED
@@ -1,6 +1,7 @@
 """The interpreter that executes SGL programs"""
 
 import asyncio
+import contextvars
 import multiprocessing
 import queue
 import threading
@@ -10,6 +11,7 @@ from contextlib import contextmanager
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import tqdm
+
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglCommitLazy,
@@ -26,8 +28,9 @@ from sglang.lang.ir import (
     SglVariable,
     SglVarScopeBegin,
     SglVarScopeEnd,
+    SglVideo,
 )
-from sglang.utils import encode_image_base64
+from sglang.utils import encode_image_base64, encode_video_base64, get_exception_traceback
 
 
 def run_internal(state, program, func_args, func_kwargs, sync):
@@ -84,9 +87,9 @@ def run_program_batch(
     if hasattr(backend, "endpoint"):
         backend = backend.endpoint
 
-    #
-    if len(batch_arguments) > 1:
-
+    # Pre-cache the common prefix for a batch. The prefix is extracted by tracing the program.
+    if global_config.enable_precache_with_tracing and len(batch_arguments) > 1:
+        cache_program(program, backend)
 
     # Run all programs
     if num_threads == "auto":
@@ -152,21 +155,12 @@ def run_program_batch(
     return rets
 
 
-def
-
-    # TODO: handle multiple backends
-    from sglang.lang.tracer import extract_prefix_by_tracing
-
-    prefix = extract_prefix_by_tracing(program, backend)
-    if prefix and len(prefix) > 64:
-        prefix_rid = backend.cache_prefix(prefix)
-        program.pin_prefix_rid = prefix_rid
-        return prefix_rid
-    return None
+def cache_program(program, backend):
+    from sglang.lang.tracer import extract_prefix_by_tracing
 
-
-
-
+    prefix = extract_prefix_by_tracing(program, backend)
+    if prefix and len(prefix) > 64:
+        backend.cache_prefix(prefix)
 
 
 class StreamExecutor:
@@ -193,6 +187,7 @@ class StreamExecutor:
         self.variable_event = {}  # Dict[name: str -> event: threading.Event]
         self.meta_info = {}  # Dict[name: str -> info: str]
         self.is_finished = False
+        self.error = None
 
         # For completion
         self.text_ = ""  # The full text
@@ -217,7 +212,13 @@
         self.use_thread = use_thread
         if self.use_thread:
             self.queue = queue.Queue()
-
+
+            def _run_worker_in_context():
+                self._thread_worker_func()
+
+            self.worker = threading.Thread(
+                target=contextvars.copy_context().run, args=(_run_worker_in_context,)
+            )
             self.worker.start()
 
         # For streaming
@@ -248,17 +249,24 @@
     def set_var(self, name, value):
         self.variables[name] = value
 
-    def get_meta_info(self, name):
+    def get_meta_info(self, name, timeout=None):
         if name in self.variable_event:
-            self.variable_event[name].wait()
+            got = self.variable_event[name].wait(timeout)
+            if not got:
+                raise TimeoutError(f"Timeout while waiting for event '{name}'")
         ret = self.meta_info.get(name, None)
         return ret
 
-    def fork(
-        self
-
+    def fork(
+        self,
+        size: int = 1,
+        position_ids_offset: Optional[List[int]] = None,
+    ):
+        if size > 1:
+            self.submit(SglCommitLazy())
 
-
+        self.sync()
+        size = int(size)
 
         exes = [
             StreamExecutor(
@@ -268,14 +276,15 @@
                 self.chat_template,
                 self.stream,
             )
-            for _ in range(
+            for _ in range(size)
         ]
-        for i in range(
+        for i in range(size):
             exes[i].variables = dict(self.variables)
             exes[i].text_ = str(self.text_)
             exes[i].messages_ = list(self.messages_)
             exes[i].cur_role = self.cur_role
             exes[i].fork_start_text_pos = len(self.text_)
+            exes[i].images_ = list(self.images_)
 
         return exes
 
@@ -294,17 +303,39 @@
             self.backend.end_program(self)
 
     def _thread_worker_func(self):
+        error = None
+
         while True:
             expr = self.queue.get()
             if expr is None:
                 self.queue.task_done()
                 break
 
-
+            try:
+                self._execute(expr)
+            except Exception as e:
+                # print(f"Error in stream_executor: {get_exception_traceback()}")
+                error = e
+                break
             self.queue.task_done()
             if self.stream_text_event:
                 self.stream_text_event.set()
 
+        # Clean the queue and events
+        if error is not None:
+            try:
+                while True:
+                    self.queue.task_done()
+                    self.queue.get_nowait()
+            except queue.Empty:
+                pass
+            for name in self.variable_event:
+                self.variable_event[name].set()
+            if self.stream_var_event:
+                for name in self.stream_var_event:
+                    self.stream_var_event[name].set()
+            self.error = error
+
         if self.stream_text_event:
             self.stream_text_event.set()
 
@@ -331,6 +362,8 @@
             self._execute_role_end(other)
         elif isinstance(other, SglImage):
             self._execute_image(other)
+        elif isinstance(other, SglVideo):
+            self._execute_video(other)
         elif isinstance(other, SglVariable):
             self._execute_variable(other)
         elif isinstance(other, SglVarScopeBegin):
@@ -367,6 +400,16 @@
         self.cur_images.append((path, base64_data))
         self.text_ += self.chat_template.image_token
 
+    def _execute_video(self, expr: SglVideo):
+        path = expr.path
+        num_frames = expr.num_frames
+
+        base64_data = encode_video_base64(path, num_frames)
+
+        self.images_.append((path, base64_data))
+        self.cur_images.append((path, base64_data))
+        self.text_ += self.chat_template.image_token
+
         # if global_config.eager_fill_image:
         #     self.backend.fill_image(self)
 
@@ -454,15 +497,19 @@
             self.stream_var_event[name].set()
 
     def _execute_select(self, expr: SglSelect):
-
-
-
+        (
+            decision,
+            normalized_prompt_logprobs,
+            prefill_token_logprobs,
+            decode_token_logprobs,
+        ) = self.backend.select(self, expr.choices, expr.temperature)
         if expr.name is not None:
             name = expr.name
             self.variables[name] = decision
             self.meta_info[name] = {
-                "
-                "
+                "normalized_prompt_logprobs": normalized_prompt_logprobs,
+                "prefill_token_logprobs": prefill_token_logprobs,
+                "decode_token_logprobs": decode_token_logprobs,
             }
             self.variable_event[name].set()
         self.text_ += decision
@@ -634,8 +681,12 @@ class ProgramState:
             yield
         self.stream_executor.submit(SglVarScopeEnd(name))
 
-    def fork(
-
+    def fork(
+        self,
+        size: int = 1,
+        position_ids_offset: Optional[List[int]] = None,
+    ):
+        stream_executors = self.stream_executor.fork(size, position_ids_offset)
         states = [ProgramState(x) for x in stream_executors]
         state_group = ProgramStateGroup(states, self)
         return state_group
@@ -657,6 +708,9 @@ class ProgramState:
     def sync(self):
         return self.stream_executor.sync()
 
+    def error(self):
+        return self.stream_executor.error
+
     def text_iter(self, var_name: Optional[str] = None):
         if self.stream_executor.stream:
             prev = 0
@@ -745,6 +799,9 @@ class ProgramState:
     def __setitem__(self, name, value):
         self.set_var(name, value)
 
+    def __contains__(self, name):
+        return name in self.stream_executor.variables
+
     def __del__(self):
         self.stream_executor.end()
 
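Taken together, `fork()` now accepts an explicit size, commits pending lazy operations and syncs before copying state (including images), and any exception raised in the worker thread is captured and exposed through the new `ProgramState.error()` accessor instead of being silently lost. A hedged usage sketch follows; the prompts are placeholders and the backend is assumed to be configured elsewhere.

```python
import sglang as sgl

@sgl.function
def brainstorm(s, topic):
    s += "Suggest ideas about " + topic + ".\n"
    forks = s.fork(2)  # commits lazy ops and syncs before forking
    forks[0] += "Idea A: " + sgl.gen("a", max_tokens=24)
    forks[1] += "Idea B: " + sgl.gen("b", max_tokens=24)
    forks.join()

state = brainstorm.run(topic="unit testing")
state.sync()
if state.error() is not None:
    # Errors from the background worker thread are now surfaced here.
    raise state.error()
print(state.text())
```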