sglang 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +33 -26
- sglang/api.py +9 -1
- sglang/bench_latency.py +2 -2
- sglang/bench_serving.py +10 -1
- sglang/check_env.py +1 -1
- sglang/lang/backend/litellm.py +1 -1
- sglang/lang/backend/openai.py +1 -1
- sglang/lang/interpreter.py +21 -5
- sglang/lang/ir.py +1 -2
- sglang/srt/constrained/__init__.py +15 -0
- sglang/srt/constrained/{base_cache.py → base_tool_cache.py} +17 -2
- sglang/srt/constrained/fsm_cache.py +17 -2
- sglang/srt/constrained/jump_forward.py +17 -2
- sglang/srt/conversation.py +26 -0
- sglang/srt/hf_transformers_utils.py +15 -0
- sglang/srt/layers/context_flashattention_nopad.py +15 -0
- sglang/srt/layers/extend_attention.py +15 -0
- sglang/srt/layers/fused_moe.py +15 -0
- sglang/srt/layers/linear.py +15 -0
- sglang/srt/layers/logits_processor.py +41 -13
- sglang/srt/layers/quantization/__init__.py +15 -0
- sglang/srt/layers/quantization/fp8.py +15 -0
- sglang/srt/layers/radix_attention.py +17 -2
- sglang/srt/layers/token_attention.py +16 -1
- sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
- sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
- sglang/srt/managers/detokenizer_manager.py +16 -1
- sglang/srt/managers/io_struct.py +36 -3
- sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
- sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +60 -21
- sglang/srt/managers/tokenizer_manager.py +39 -16
- sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +159 -46
- sglang/srt/mem_cache/base_cache.py +43 -0
- sglang/srt/mem_cache/chunk_cache.py +60 -0
- sglang/srt/mem_cache/flush_cache.py +33 -0
- sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
- sglang/srt/{managers/controller → mem_cache}/radix_cache.py +20 -2
- sglang/srt/mm_utils.py +15 -0
- sglang/srt/model_config.py +15 -0
- sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +16 -1
- sglang/srt/{managers/controller → model_executor}/model_runner.py +49 -14
- sglang/srt/model_loader/model_loader.py +15 -0
- sglang/srt/model_loader/utils.py +16 -1
- sglang/srt/models/chatglm.py +16 -1
- sglang/srt/models/commandr.py +16 -1
- sglang/srt/models/dbrx.py +16 -1
- sglang/srt/models/deepseek.py +16 -1
- sglang/srt/models/deepseek_v2.py +16 -1
- sglang/srt/models/gemma.py +16 -1
- sglang/srt/models/gemma2.py +16 -1
- sglang/srt/models/gpt_bigcode.py +16 -1
- sglang/srt/models/grok.py +16 -1
- sglang/srt/models/internlm2.py +16 -1
- sglang/srt/models/llama2.py +21 -22
- sglang/srt/models/llama_classification.py +16 -1
- sglang/srt/models/llava.py +17 -2
- sglang/srt/models/llavavid.py +17 -2
- sglang/srt/models/minicpm.py +16 -1
- sglang/srt/models/mistral.py +15 -0
- sglang/srt/models/mixtral.py +16 -1
- sglang/srt/models/mixtral_quant.py +16 -1
- sglang/srt/models/qwen.py +16 -1
- sglang/srt/models/qwen2.py +16 -1
- sglang/srt/models/qwen2_moe.py +16 -1
- sglang/srt/models/stablelm.py +16 -1
- sglang/srt/models/yivl.py +15 -0
- sglang/srt/openai_api/adapter.py +569 -131
- sglang/srt/openai_api/protocol.py +84 -2
- sglang/srt/sampling_params.py +15 -0
- sglang/srt/server.py +92 -23
- sglang/srt/server_args.py +52 -11
- sglang/srt/utils.py +15 -0
- sglang/test/test_programs.py +9 -6
- sglang/utils.py +22 -0
- sglang/version.py +1 -1
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/METADATA +33 -7
- sglang-0.2.8.dist-info/RECORD +95 -0
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/WHEEL +1 -1
- sglang/srt/flush_cache.py +0 -18
- sglang-0.2.6.dist-info/RECORD +0 -93
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/LICENSE +0 -0
- {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Pydantic models for OpenAI API protocol"""
 
 import time
@@ -39,12 +54,79 @@ class LogProbs(BaseModel):
     top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list)
 
 
+class TopLogprob(BaseModel):
+    token: str
+    bytes: List[int]
+    logprob: float
+
+
+class ChatCompletionTokenLogprob(BaseModel):
+    token: str
+    bytes: List[int]
+    logprob: float
+    top_logprobs: List[TopLogprob]
+
+
+class ChoiceLogprobs(BaseModel):
+    # build for v1/chat/completions response
+    content: List[ChatCompletionTokenLogprob]
+
+
 class UsageInfo(BaseModel):
     prompt_tokens: int = 0
     total_tokens: int = 0
     completion_tokens: Optional[int] = 0
 
 
+class FileRequest(BaseModel):
+    # https://platform.openai.com/docs/api-reference/files/create
+    file: bytes  # The File object (not file name) to be uploaded
+    purpose: str = (
+        "batch"  # The intended purpose of the uploaded file, default is "batch"
+    )
+
+
+class FileResponse(BaseModel):
+    id: str
+    object: str = "file"
+    bytes: int
+    created_at: int
+    filename: str
+    purpose: str
+
+
+class BatchRequest(BaseModel):
+    input_file_id: (
+        str  # The ID of an uploaded file that contains requests for the new batch
+    )
+    endpoint: str  # The endpoint to be used for all requests in the batch
+    completion_window: str  # The time frame within which the batch should be processed
+    metadata: Optional[dict] = None  # Optional custom metadata for the batch
+
+
+class BatchResponse(BaseModel):
+    id: str
+    object: str = "batch"
+    endpoint: str
+    errors: Optional[dict] = None
+    input_file_id: str
+    completion_window: str
+    status: str = "validating"
+    output_file_id: Optional[str] = None
+    error_file_id: Optional[str] = None
+    created_at: int
+    in_progress_at: Optional[int] = None
+    expires_at: Optional[int] = None
+    finalizing_at: Optional[int] = None
+    completed_at: Optional[int] = None
+    failed_at: Optional[int] = None
+    expired_at: Optional[int] = None
+    cancelling_at: Optional[int] = None
+    cancelled_at: Optional[int] = None
+    request_counts: dict = {"total": 0, "completed": 0, "failed": 0}
+    metadata: Optional[dict] = None
+
+
 class CompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/completions/create
@@ -175,8 +257,8 @@ class ChatMessage(BaseModel):
 class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
-    logprobs: Optional[LogProbs] = None
-    finish_reason:
+    logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
+    finish_reason: str
 
 
 class ChatCompletionResponse(BaseModel):
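For orientation, the sketch below (an illustration, not part of the diff) shows how the new logprob models nest; it assumes sglang 0.2.8 and pydantic are installed, and the field values are made up.

```python
# Minimal sketch of the new chat-completion logprob models added above.
# The token strings and logprob values are placeholders.
from sglang.srt.openai_api.protocol import (
    ChatCompletionTokenLogprob,
    ChoiceLogprobs,
    TopLogprob,
)

token_entry = ChatCompletionTokenLogprob(
    token="Paris",
    bytes=list("Paris".encode("utf-8")),
    logprob=-0.012,
    top_logprobs=[
        TopLogprob(token="Paris", bytes=list("Paris".encode("utf-8")), logprob=-0.012)
    ],
)

# ChoiceLogprobs is what a v1/chat/completions response choice can now carry
# in its `logprobs` field (alongside the older LogProbs shape).
choice_logprobs = ChoiceLogprobs(content=[token_entry])
print(choice_logprobs)
```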
sglang/srt/sampling_params.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Sampling parameters for text generation."""
 
 from typing import List, Optional, Union
sglang/srt/server.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """
 The entry point of inference server.
 SRT = SGLang Runtime.
@@ -23,17 +38,17 @@ import psutil
 import requests
 import uvicorn
 import uvloop
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, File, Form, Request, UploadFile
 from fastapi.responses import JSONResponse, Response, StreamingResponse
 
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.constrained import disable_cache
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.managers.
+from sglang.srt.managers.controller_multi import (
     start_controller_process as start_controller_process_multi,
 )
-from sglang.srt.managers.
-from sglang.srt.managers.
+from sglang.srt.managers.controller_single import launch_tp_servers
+from sglang.srt.managers.controller_single import (
     start_controller_process as start_controller_process_single,
 )
 from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
@@ -41,8 +56,13 @@ from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.openai_api.adapter import (
     load_chat_template_for_openai_api,
+    v1_batches,
     v1_chat_completions,
     v1_completions,
+    v1_files_create,
+    v1_retrieve_batch,
+    v1_retrieve_file,
+    v1_retrieve_file_content,
 )
 from sglang.srt.openai_api.protocol import ModelCard, ModelList
 from sglang.srt.server_args import PortArgs, ServerArgs
@@ -137,6 +157,35 @@ async def openai_v1_chat_completions(raw_request: Request):
     return await v1_chat_completions(tokenizer_manager, raw_request)
 
 
+@app.post("/v1/files")
+async def openai_v1_files(file: UploadFile = File(...), purpose: str = Form("batch")):
+    return await v1_files_create(
+        file, purpose, tokenizer_manager.server_args.file_storage_pth
+    )
+
+
+@app.post("/v1/batches")
+async def openai_v1_batches(raw_request: Request):
+    return await v1_batches(tokenizer_manager, raw_request)
+
+
+@app.get("/v1/batches/{batch_id}")
+async def retrieve_batch(batch_id: str):
+    return await v1_retrieve_batch(batch_id)
+
+
+@app.get("/v1/files/{file_id}")
+async def retrieve_file(file_id: str):
+    # https://platform.openai.com/docs/api-reference/files/retrieve
+    return await v1_retrieve_file(file_id)
+
+
+@app.get("/v1/files/{file_id}/content")
+async def retrieve_file_content(file_id: str):
+    # https://platform.openai.com/docs/api-reference/files/retrieve-contents
+    return await v1_retrieve_file_content(file_id)
+
+
 @app.get("/v1/models")
 def available_models():
     """Show available models."""
@@ -160,6 +209,39 @@ def _set_torch_compile_config():
     torch._dynamo.config.accumulated_cache_size_limit = 256
 
 
+def set_envs_and_config(server_args: ServerArgs):
+    # Set global environments
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+    os.environ["NCCL_CUMEM_ENABLE"] = "0"
+    os.environ["NCCL_NVLS_ENABLE"] = "0"
+    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+    # Set ulimit
+    set_ulimit()
+
+    # Enable show time cost for debugging
+    if server_args.show_time_cost:
+        enable_show_time_cost()
+
+    # Disable disk cache
+    if server_args.disable_disk_cache:
+        disable_cache()
+
+    # Fix triton bugs
+    if server_args.tp_size * server_args.dp_size > 1:
+        # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
+        maybe_set_triton_cache_manager()
+
+    # Set torch compile config
+    if server_args.enable_torch_compile:
+        _set_torch_compile_config()
+
+    # Set global chat template
+    if server_args.chat_template:
+        # TODO: replace this with huggingface transformers template
+        load_chat_template_for_openai_api(server_args.chat_template)
+
+
 def launch_server(
     server_args: ServerArgs,
     model_overide_args: Optional[dict] = None,
@@ -175,32 +257,16 @@ def launch_server(
         format="%(message)s",
     )
 
-    # Set global environments
-    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-    os.environ["NCCL_CUMEM_ENABLE"] = "0"
-    os.environ["NCCL_NVLS_ENABLE"] = "0"
-    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
-    set_ulimit()
-    if server_args.show_time_cost:
-        enable_show_time_cost()
-    if server_args.disable_disk_cache:
-        disable_cache()
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.
+            "0.1.3",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
         )
-
-
-    maybe_set_triton_cache_manager()
-    if server_args.chat_template:
-        # TODO: replace this with huggingface transformers template
-        load_chat_template_for_openai_api(server_args.chat_template)
-    if server_args.enable_torch_compile:
-        _set_torch_compile_config()
+
+    set_envs_and_config(server_args)
 
     # Allocate ports
     server_args.port, server_args.additional_ports = allocate_init_ports(
@@ -413,6 +479,9 @@ class Runtime:
             parent.wait(timeout=5)
         self.pid = None
 
+    def cache_prefix(self, prefix: str):
+        self.endpoint.cache_prefix(prefix)
+
     def get_tokenizer(self):
         return get_tokenizer(
             self.server_args.tokenizer_path,
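The new routes mirror the OpenAI Files and Batches APIs. As an illustration only (not taken from the package), the sketch below drives them with the `openai` Python SDK; the base URL matches the README's default port, while the model name, file contents, and the JSONL request format are assumptions based on the OpenAI batch format.

```python
# Hypothetical client-side sketch for the /v1/files and /v1/batches routes added above.
import json

from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

# Each line of the input file is one request (OpenAI batch-style JSONL; assumed format).
with open("batch_input.jsonl", "w") as f:
    f.write(json.dumps({
        "custom_id": "req-1",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "default", "messages": [{"role": "user", "content": "Hello"}]},
    }) + "\n")

# Upload the file (stored under file_storage_pth on the server side),
# then create and poll a batch job against the chat-completions endpoint.
uploaded = client.files.create(file=open("batch_input.jsonl", "rb"), purpose="batch")
batch = client.batches.create(
    input_file_id=uploaded.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)
print(client.batches.retrieve(batch.id).status)
```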
sglang/srt/server_args.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """The arguments of the server."""
 
 import argparse
@@ -29,7 +44,8 @@ class ServerArgs:
     max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
     max_num_reqs: Optional[int] = None
-
+    max_total_tokens: Optional[int] = None
+    schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
 
     # Other runtime options
@@ -45,11 +61,15 @@ class ServerArgs:
 
     # Other
     api_key: str = ""
+    file_storage_pth: str = "SGlang_storage"
 
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
 
+    # Chunked Prefill
+    chunked_prefill_size: Optional[int] = None
+
     # Optimization/debug options
     disable_flashinfer: bool = False
     disable_flashinfer_sampling: bool = False
@@ -72,15 +92,15 @@ class ServerArgs:
             self.tokenizer_path = self.model_path
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.79
             elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.83
             elif self.tp_size >= 4:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.85
             elif self.tp_size >= 2:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.87
             else:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.88
         if isinstance(self.additional_ports, int):
             self.additional_ports = [self.additional_ports]
         elif self.additional_ports is None:
@@ -176,6 +196,7 @@ class ServerArgs:
             "gptq",
             "marlin",
             "gptq_marlin",
+            "awq_marlin",
             "squeezellm",
             "bitsandbytes",
         ],
@@ -208,15 +229,21 @@ class ServerArgs:
         parser.add_argument(
             "--max-num-reqs",
             type=int,
-            default=
+            default=ServerArgs.max_num_reqs,
             help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
         )
         parser.add_argument(
-            "--
+            "--max-total-tokens",
+            type=int,
+            default=ServerArgs.max_total_tokens,
+            help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.",
+        )
+        parser.add_argument(
+            "--schedule-policy",
             type=str,
-            default=ServerArgs.
+            default=ServerArgs.schedule_policy,
             choices=["lpm", "random", "fcfs", "dfs-weight"],
-            help="The scheduling
+            help="The scheduling policy of the requests.",
         )
         parser.add_argument(
             "--schedule-conservativeness",
@@ -270,6 +297,12 @@ class ServerArgs:
             default=ServerArgs.api_key,
             help="Set API key of the server.",
         )
+        parser.add_argument(
+            "--file-storage-pth",
+            type=str,
+            default=ServerArgs.file_storage_pth,
+            help="The path of the file storage in backend.",
+        )
 
         # Data parallelism
         parser.add_argument(
@@ -296,10 +329,18 @@ class ServerArgs:
             help="The nccl init address of multi-node server.",
         )
         parser.add_argument(
-            "--nnodes", type=int, default=
+            "--nnodes", type=int, default=ServerArgs.nnodes, help="The number of nodes."
        )
         parser.add_argument("--node-rank", type=int, help="The node rank.")
 
+        # Chunked prefill
+        parser.add_argument(
+            "--chunked-prefill-size",
+            type=int,
+            default=ServerArgs.chunked_prefill_size,
+            help="The size of the chunked prefill.",
+        )
+
         # Optimization/debug options
         parser.add_argument(
             "--disable-flashinfer",
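As an illustration of the new options (not part of the diff), the sketch below constructs `ServerArgs` directly; the model path is a placeholder and the chosen values are arbitrary. The same options are exposed on the CLI as `--max-total-tokens`, `--schedule-policy`, `--chunked-prefill-size`, and `--file-storage-pth`.

```python
# Illustrative only: showing the ServerArgs fields introduced in 0.2.8.
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model
    schedule_policy="lpm",             # "lpm", "random", "fcfs", or "dfs-weight"
    max_total_tokens=None,             # auto-derived from mem_fraction_static when unset
    chunked_prefill_size=4096,         # new: chunked prefill size (arbitrary value here)
    file_storage_pth="SGlang_storage", # new: where /v1/files uploads are stored
)
# mem_fraction_static is filled in automatically based on tp_size (see the hunk above).
print(args.mem_fraction_static)
```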
sglang/srt/utils.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Common utilities."""
 
 import base64
sglang/test/test_programs.py
CHANGED
@@ -113,15 +113,14 @@ def test_decode_json_regex():
         s += ' "population": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
         s += ' "area": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
         s += ' "latitude": ' + sgl.gen(regex=REGEX_FLOAT + ",") + "\n"
-        s += ' "country": ' + sgl.gen(regex=REGEX_STRING
-        s += ' "timezone": ' + sgl.gen(regex=REGEX_STRING) + "\n"
+        s += ' "country": ' + sgl.gen(regex=REGEX_STRING) + "\n"
         s += "}"
 
-    ret = decode_json.run()
+    ret = decode_json.run(temperature=0.0)
     try:
         js_obj = json.loads(ret["json_output"])
     except json.decoder.JSONDecodeError:
-        print(ret["json_output"])
+        print("JSONDecodeError", ret["json_output"])
         raise
     assert isinstance(js_obj["name"], str)
     assert isinstance(js_obj["population"], int)
@@ -141,8 +140,12 @@ def test_decode_json():
         s += ' "timezone": ' + sgl.gen(dtype=str) + "\n"
         s += "}"
 
-    ret = decode_json.run()
-
+    ret = decode_json.run(max_new_tokens=64)
+    try:
+        js_obj = json.loads(ret["json_output"])
+    except json.decoder.JSONDecodeError:
+        print("JSONDecodeError", ret["json_output"])
+        raise
     assert isinstance(js_obj["name"], str)
     assert isinstance(js_obj["population"], int)
 
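The updated tests pass per-call sampling overrides to `.run()`. A minimal standalone sketch of that pattern follows (illustrative, not from the package); the backend URL is a placeholder for a running sglang server.

```python
# Sketch of overriding sampling parameters per call via .run().
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def describe_city(s):
    s += "Describe Paris in one sentence.\n"
    s += sgl.gen("description", stop="\n")

# Greedy decoding and a hard cap on new tokens, as in the updated tests.
ret = describe_city.run(temperature=0.0, max_new_tokens=64)
print(ret["description"])
```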
sglang/utils.py
CHANGED
@@ -1,6 +1,7 @@
 """Common utilities."""
 
 import base64
+import importlib
 import json
 import logging
 import signal
@@ -261,3 +262,24 @@ def graceful_registry(sub_module_name):
         logger.info(f"{sub_module_name} recive sigterm")
 
     signal.signal(signal.SIGTERM, graceful_shutdown)
+
+
+class LazyImport:
+    def __init__(self, module_name, class_name):
+        self.module_name = module_name
+        self.class_name = class_name
+        self._module = None
+
+    def _load(self):
+        if self._module is None:
+            module = importlib.import_module(self.module_name)
+            self._module = getattr(module, self.class_name)
+        return self._module
+
+    def __getattr__(self, name):
+        module = self._load()
+        return getattr(module, name)
+
+    def __call__(self, *args, **kwargs):
+        module = self._load()
+        return module(*args, **kwargs)
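The new `LazyImport` helper defers the actual import until the first attribute access or call. A minimal usage sketch (illustrative; `json`/`JSONDecoder` stand in for a heavyweight optional dependency):

```python
# The target module is only imported when the proxy is first used.
from sglang.utils import LazyImport

JSONDecoder = LazyImport("json", "JSONDecoder")  # nothing imported yet
decoder = JSONDecoder()                          # triggers importlib.import_module("json")
print(decoder.decode('{"ok": true}'))
```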
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.6"
+__version__ = "0.2.8"
{sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.6
+Version: 0.2.8
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -245,6 +245,13 @@ Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
 
 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+
+[](https://pypi.org/project/sglang)
+
+[](https://github.com/sgl-project/sglang/tree/main/LICENSE)
+[](https://github.com/sgl-project/sglang/issues)
+[](https://github.com/sgl-project/sglang/issues)
+
 </div>
 
 --------------------------------------------------------------------------------
@@ -292,7 +299,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 
 ### Method 2: From source
 ```
-
+# Use the stable v0.2.8 branch
+git clone -b v0.2.8 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -304,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 
 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](docker).
-
+Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
 
 ```bash
 docker run --gpus all \
@@ -341,7 +349,7 @@ curl http://localhost:30000/generate \
 }
 }'
 ```
-Learn more about the argument format [here](docs/sampling_params.md).
+Learn more about the argument format [here](docs/en/sampling_params.md).
 
 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
@@ -388,7 +396,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
-- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
@@ -397,7 +405,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 # Node 1
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
-- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
@@ -440,7 +448,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 - InternLM 2
 - Mistral NeMo
 
-Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
+Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
 ### Benchmark Performance
 
@@ -671,6 +679,24 @@ for out in state.text_iter():
     print(out, end="", flush=True)
 ```
 
+#### Roles
+
+Use `sgl.system`, `sgl.user` and `sgl.assistant` to set roles when using Chat models. You can also define more complex role prompts using begin and end tokens.
+
+```python
+@sgl.function
+def chat_example(s):
+    s += sgl.system("You are a helpful assistant.")
+    # Same as: s += s.system("You are a helpful assistant.")
+
+    with s.user():
+        s += "Question: What is the capital of France?"
+
+    s += sgl.assistant_begin()
+    s += "Answer: " + sgl.gen(max_tokens=100, stop="\n")
+    s += sgl.assistant_end()
+```
+
 #### Tips and Implementation Details
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
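The two bullets above describe how constrained generation is implemented; the sketch below (illustrative, not from the README) shows the corresponding `choices` and `regex` arguments in use. It assumes a local sglang server; the prompt text is arbitrary.

```python
# Sketch of constrained generation with `choices` and `regex`.
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def constrained(s):
    s += "Is Paris the capital of France? "
    # Selected via token-length normalized log probabilities over the choices.
    s += sgl.gen("answer", choices=["yes", "no"])
    s += "\nA three-digit number: "
    # Decoding is constrained by logit bias masking derived from the regex.
    s += sgl.gen("number", regex=r"[0-9]{3}")

state = constrained.run(temperature=0)
print(state["answer"], state["number"])
```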
|