sglang 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. sglang/__init__.py +2 -2
  2. sglang/api.py +30 -4
  3. sglang/backend/litellm.py +2 -2
  4. sglang/backend/openai.py +26 -15
  5. sglang/backend/runtime_endpoint.py +18 -14
  6. sglang/bench_latency.py +317 -0
  7. sglang/global_config.py +5 -1
  8. sglang/lang/chat_template.py +41 -6
  9. sglang/lang/compiler.py +2 -2
  10. sglang/lang/interpreter.py +6 -2
  11. sglang/lang/ir.py +74 -28
  12. sglang/launch_server.py +4 -1
  13. sglang/launch_server_llavavid.py +2 -1
  14. sglang/srt/constrained/__init__.py +14 -6
  15. sglang/srt/constrained/fsm_cache.py +6 -3
  16. sglang/srt/constrained/jump_forward.py +113 -25
  17. sglang/srt/conversation.py +2 -0
  18. sglang/srt/flush_cache.py +2 -0
  19. sglang/srt/hf_transformers_utils.py +68 -9
  20. sglang/srt/layers/extend_attention.py +2 -1
  21. sglang/srt/layers/fused_moe.py +280 -169
  22. sglang/srt/layers/logits_processor.py +106 -42
  23. sglang/srt/layers/radix_attention.py +53 -29
  24. sglang/srt/layers/token_attention.py +4 -1
  25. sglang/srt/managers/controller/dp_worker.py +6 -3
  26. sglang/srt/managers/controller/infer_batch.py +144 -69
  27. sglang/srt/managers/controller/manager_multi.py +5 -5
  28. sglang/srt/managers/controller/manager_single.py +9 -4
  29. sglang/srt/managers/controller/model_runner.py +167 -55
  30. sglang/srt/managers/controller/radix_cache.py +4 -0
  31. sglang/srt/managers/controller/schedule_heuristic.py +2 -0
  32. sglang/srt/managers/controller/tp_worker.py +156 -134
  33. sglang/srt/managers/detokenizer_manager.py +19 -21
  34. sglang/srt/managers/io_struct.py +11 -5
  35. sglang/srt/managers/tokenizer_manager.py +16 -14
  36. sglang/srt/model_config.py +89 -4
  37. sglang/srt/models/chatglm.py +399 -0
  38. sglang/srt/models/commandr.py +2 -2
  39. sglang/srt/models/dbrx.py +1 -1
  40. sglang/srt/models/gemma.py +5 -1
  41. sglang/srt/models/gemma2.py +436 -0
  42. sglang/srt/models/grok.py +204 -137
  43. sglang/srt/models/llama2.py +12 -5
  44. sglang/srt/models/llama_classification.py +107 -0
  45. sglang/srt/models/llava.py +11 -8
  46. sglang/srt/models/llavavid.py +1 -1
  47. sglang/srt/models/minicpm.py +373 -0
  48. sglang/srt/models/mixtral.py +164 -115
  49. sglang/srt/models/mixtral_quant.py +0 -1
  50. sglang/srt/models/qwen.py +1 -1
  51. sglang/srt/models/qwen2.py +1 -1
  52. sglang/srt/models/qwen2_moe.py +454 -0
  53. sglang/srt/models/stablelm.py +1 -1
  54. sglang/srt/models/yivl.py +2 -2
  55. sglang/srt/openai_api_adapter.py +35 -25
  56. sglang/srt/openai_protocol.py +2 -2
  57. sglang/srt/server.py +69 -19
  58. sglang/srt/server_args.py +76 -43
  59. sglang/srt/utils.py +177 -35
  60. sglang/test/test_programs.py +28 -10
  61. sglang/utils.py +4 -3
  62. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/METADATA +44 -31
  63. sglang-0.1.19.dist-info/RECORD +81 -0
  64. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/WHEEL +1 -1
  65. sglang/srt/managers/router/infer_batch.py +0 -596
  66. sglang/srt/managers/router/manager.py +0 -82
  67. sglang/srt/managers/router/model_rpc.py +0 -818
  68. sglang/srt/managers/router/model_runner.py +0 -445
  69. sglang/srt/managers/router/radix_cache.py +0 -267
  70. sglang/srt/managers/router/scheduler.py +0 -59
  71. sglang-0.1.17.dist-info/RECORD +0 -81
  72. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/LICENSE +0 -0
  73. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py CHANGED
@@ -1,11 +1,13 @@
 """Common utilities."""
 
 import base64
-import multiprocessing
+import fcntl
 import logging
+import multiprocessing
 import os
 import random
 import socket
+import struct
 import time
 from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
@@ -17,12 +19,11 @@ import requests
 import rpyc
 import torch
 import triton
-from rpyc.utils.server import ThreadedServer
 from fastapi.responses import JSONResponse
 from packaging import version as pkg_version
+from rpyc.utils.server import ThreadedServer
 from starlette.middleware.base import BaseHTTPMiddleware
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -370,23 +371,7 @@ def load_image(image_file):
     return image, image_size
 
 
-def init_rpyc_service(service: rpyc.Service, port: int):
-    t = ThreadedServer(
-        service=service,
-        port=port,
-        protocol_config={
-            "allow_public_attrs": True,
-            "allow_pickle": True,
-            "sync_request_timeout": 3600
-        },
-    )
-    t.logger.setLevel(logging.WARN)
-    t.start()
-
-
-def connect_to_rpyc_service(port, host="localhost"):
-    time.sleep(1)
-
+def connect_rpyc_service(host, port):
     repeat_count = 0
     while repeat_count < 20:
         try:
@@ -396,26 +381,37 @@ def connect_to_rpyc_service(port, host="localhost"):
                 config={
                     "allow_public_attrs": True,
                     "allow_pickle": True,
-                    "sync_request_timeout": 3600
+                    "sync_request_timeout": 3600,
                 },
             )
             break
-        except ConnectionRefusedError:
+        except ConnectionRefusedError as e:
             time.sleep(1)
             repeat_count += 1
     if repeat_count == 20:
-        raise RuntimeError("init rpc env error!")
+        raise RuntimeError(f"Connect rpyc error: {e}")
 
     return con.root
 
 
-def start_rpyc_process(service: rpyc.Service, port: int):
-    # Return the proxy and the process
-    proc = multiprocessing.Process(target=init_rpyc_service, args=(service, port))
+def start_rpyc_service(service: rpyc.Service, port: int):
+    t = ThreadedServer(
+        service=service,
+        port=port,
+        protocol_config={
+            "allow_public_attrs": True,
+            "allow_pickle": True,
+            "sync_request_timeout": 3600,
+        },
+    )
+    t.logger.setLevel(logging.WARN)
+    t.start()
+
+
+def start_rpyc_service_process(service: rpyc.Service, port: int):
+    proc = multiprocessing.Process(target=start_rpyc_service, args=(service, port))
     proc.start()
-    proxy = connect_to_rpyc_service(port)
-    assert proc.is_alive()
-    return proxy, proc
+    return proc
 
 
 def suppress_other_loggers():
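Note: the refactor above splits the old `start_rpyc_process` (spawn, connect, and return a proxy) into `start_rpyc_service_process`, which only spawns, and `connect_rpyc_service`, which retries the connection. A minimal sketch of how a caller might compose the two, assuming a trivial `rpyc.Service` subclass (the service class, port, and assertion are illustrative, not from the diff):

```python
import rpyc

from sglang.srt.utils import connect_rpyc_service, start_rpyc_service_process


class EchoService(rpyc.Service):
    # rpyc exposes `exposed_*` methods to clients as `root.*`.
    def exposed_echo(self, x):
        return x


if __name__ == "__main__":
    proc = start_rpyc_service_process(service=EchoService, port=18861)
    root = connect_rpyc_service("localhost", 18861)  # retries for up to ~20s
    assert root.echo("ping") == "ping"
    proc.terminate()
```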
@@ -423,22 +419,25 @@ def suppress_other_loggers():
 
     vllm_default_logger.setLevel(logging.WARN)
     logging.getLogger("vllm.config").setLevel(logging.ERROR)
-    logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(logging.WARN)
+    logging.getLogger("vllm.distributed.device_communicators.pynccl").setLevel(
+        logging.WARN
+    )
     logging.getLogger("vllm.selector").setLevel(logging.WARN)
     logging.getLogger("vllm.utils").setLevel(logging.WARN)
 
 
-def assert_pkg_version(pkg: str, min_version: str):
+def assert_pkg_version(pkg: str, min_version: str, message: str):
     try:
         installed_version = version(pkg)
         if pkg_version.parse(installed_version) < pkg_version.parse(min_version):
             raise Exception(
-                f"{pkg} is installed with version {installed_version} which "
-                f"is less than the minimum required version {min_version}"
+                f"{pkg} is installed with version {installed_version}, which "
+                f"is less than the minimum required version {min_version}. " + message
             )
     except PackageNotFoundError:
         raise Exception(
-            f"{pkg} with minimum required version {min_version} is not installed"
+            f"{pkg} with minimum required version {min_version} is not installed. "
            + message
         )
 
 
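Note: `assert_pkg_version` now requires a third `message` argument that is appended to the error text, so call sites can include a remediation hint. A hedged sketch of a call site (the argument strings are illustrative):

```python
from sglang.srt.utils import assert_pkg_version

# Raises with the extra hint if vllm is missing or older than 0.5.1.
assert_pkg_version(
    "vllm",
    "0.5.1",
    "Please upgrade it with `pip install --upgrade vllm`.",
)
```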
 
@@ -453,16 +452,75 @@ def kill_parent_process():
     os.kill(parent_process.pid, 9)
 
 
-def monkey_patch_vllm_p2p_access_check():
+def monkey_patch_vllm_p2p_access_check(gpu_id: int):
     """
     Monkey patch the slow p2p access check in vllm.
     NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
     """
+
     import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
 
     setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
 
 
+def monkey_patch_vllm_dummy_weight_loader():
+    """
+    Monkey patch the dummy weight loader in vllm to call process_weights_after_loading.
+    """
+
+    from vllm.model_executor.model_loader.loader import (
+        CacheConfig,
+        DeviceConfig,
+        DummyModelLoader,
+        LoRAConfig,
+        ModelConfig,
+        ParallelConfig,
+        SchedulerConfig,
+        MultiModalConfig,
+        _initialize_model,
+        initialize_dummy_weights,
+        nn,
+        set_default_torch_dtype,
+    )
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+        lora_config: Optional[LoRAConfig],
+        multimodal_config: Optional[MultiModalConfig],
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        cache_config: CacheConfig,
+    ) -> nn.Module:
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(
+                    model_config,
+                    self.load_config,
+                    lora_config,
+                    multimodal_config,
+                    cache_config,
+                )
+
+                for _, module in model.named_modules():
+                    quant_method = getattr(module, "quant_method", None)
+                    if quant_method is not None:
+                        quant_method.process_weights_after_loading(module)
+                    # FIXME: Remove this after Mixtral is updated
+                    # to use quant_method.
+                    if hasattr(module, "process_weights_after_loading"):
+                        module.process_weights_after_loading()
+
+            # NOTE(woosuk): For accurate performance evaluation, we assign
+            # random values to the weights.
+            initialize_dummy_weights(model)
+        return model.eval()
+
+    setattr(DummyModelLoader, "load_model", load_model)
+
+
 API_KEY_HEADER_NAME = "X-API-Key"
 
 
@@ -482,3 +540,87 @@ class APIKeyValidatorMiddleware(BaseHTTPMiddleware):
         response = await call_next(request)
         return response
 
+
+def get_ip_address(ifname):
+    """
+    Get the IP address of a network interface.
+
+    :param ifname: Name of the network interface (e.g., 'eth0')
+    :return: IP address of the network interface
+    """
+    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    ip_address = fcntl.ioctl(
+        s.fileno(),
+        0x8915,  # SIOCGIFADDR
+        struct.pack("256s", bytes(ifname[:15], "utf-8")),
+    )[20:24]
+    return socket.inet_ntoa(ip_address)
+
+
+def send_addrs_to_rank_0(model_port_args, server_args):
+    assert server_args.node_rank != 0 and server_args.dp_size == 1
+    import torch.distributed as dist
+
+    ifname = os.environ.get(
+        "SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0")
+    )
+    ip_addr = get_ip_address(ifname)
+
+    num_tp_ports = server_args.tp_size // server_args.nnodes
+    model_port_args.model_tp_ips[:num_tp_ports] = [ip_addr] * num_tp_ports
+    ip_addr = [int(x) for x in ip_addr.split(".")]
+    addrs_tensor = torch.tensor(
+        ip_addr + model_port_args.model_tp_ports, dtype=torch.int
+    )
+
+    init_method = f"tcp://{server_args.nccl_init_addr}"
+    dist.init_process_group(
+        backend="gloo",
+        init_method=init_method,
+        rank=server_args.node_rank,
+        world_size=server_args.nnodes,
+    )
+    dist.send(addrs_tensor, dst=0)
+    print(
+        f"Node {server_args.node_rank} sent: ip_address {ip_addr} and ports {model_port_args.model_tp_ports}"
+    )
+
+    dist.barrier()
+    dist.destroy_process_group()
+
+
+def receive_addrs(model_port_args, server_args):
+    assert server_args.node_rank == 0 and server_args.dp_size == 1
+    import torch.distributed as dist
+
+    ifname = os.environ.get(
+        "SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0")
+    )
+    ip_addr = get_ip_address(ifname)
+
+    num_tp_ports = server_args.tp_size // server_args.nnodes
+    model_port_args.model_tp_ips[:num_tp_ports] = [ip_addr] * num_tp_ports
+
+    init_method = f"tcp://{server_args.nccl_init_addr}"
+    dist.init_process_group(
+        backend="gloo",
+        init_method=init_method,
+        rank=server_args.node_rank,
+        world_size=server_args.nnodes,
+    )
+
+    for src_rank in range(1, server_args.nnodes):
+        tensor = torch.zeros(4 + num_tp_ports, dtype=torch.int)
+        dist.recv(tensor, src=src_rank)
+        ip = ".".join([str(x) for x in tensor[:4].tolist()])
+        ports = tensor[4:].tolist()
+        model_port_args.model_tp_ips[
+            num_tp_ports * src_rank : num_tp_ports * (src_rank + 1)
+        ] = [ip] * num_tp_ports
+        model_port_args.model_tp_ports[
+            num_tp_ports * src_rank : num_tp_ports * (src_rank + 1)
+        ] = ports
+        print(f"Node 0 received from rank {src_rank}: {tensor.tolist()}")
+
+    dist.barrier()
+    dist.destroy_process_group()
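Note: the new multi-node handshake above packs each worker's IPv4 address as four integers followed by its tensor-parallel ports, then ships that tensor to rank 0 over a temporary gloo process group. A self-contained sketch of just the pack/unpack step (values are illustrative; no process group involved):

```python
import torch

# Pack: four IPv4 octets followed by this node's tensor-parallel ports.
ip = "10.0.0.42"
ports = [30011, 30012]
packed = torch.tensor([int(x) for x in ip.split(".")] + ports, dtype=torch.int)

# Unpack on the receiving side: the first 4 ints are the address, the rest are ports.
octets, recv_ports = packed[:4].tolist(), packed[4:].tolist()
assert ".".join(map(str, octets)) == ip and recv_ports == ports
```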
sglang/test/test_programs.py CHANGED
@@ -1,6 +1,4 @@
-"""
-This file contains the SGL programs used for unit testing.
-"""
+"""This file contains the SGL programs used for unit testing."""
 
 import json
 import re
@@ -358,16 +356,25 @@ def test_completion_speculative():
         s += "Construct a character within the following format:\n"
         s += "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
         s += "\nPlease generate new Name, Birthday and Job.\n"
-        s += "Name:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n")
+        s += (
+            "Name:"
+            + sgl.gen("name", stop="\n")
+            + "\nBirthday:"
+            + sgl.gen("birthday", stop="\n")
+        )
         s += "\nJob:" + sgl.gen("job", stop="\n") + "\n"
 
-
     @sgl.function
     def gen_character_no_spec(s):
         s += "Construct a character within the following format:\n"
         s += "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
         s += "\nPlease generate new Name, Birthday and Job.\n"
-        s += "Name:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n")
+        s += (
+            "Name:"
+            + sgl.gen("name", stop="\n")
+            + "\nBirthday:"
+            + sgl.gen("birthday", stop="\n")
+        )
         s += "\nJob:" + sgl.gen("job", stop="\n") + "\n"
 
     token_usage = sgl.global_config.default_backend.token_usage
@@ -380,7 +387,9 @@ def test_completion_speculative():
     gen_character_no_spec().sync()
     usage_with_no_spec = token_usage.prompt_tokens
 
-    assert usage_with_spec < usage_with_no_spec, f"{usage_with_spec} vs {usage_with_no_spec}"
+    assert (
+        usage_with_spec < usage_with_no_spec
+    ), f"{usage_with_spec} vs {usage_with_no_spec}"
 
 
 def test_chat_completion_speculative():
@@ -388,8 +397,17 @@ def test_chat_completion_speculative():
     def gen_character_spec(s):
         s += sgl.system("You are a helpful assistant.")
         s += sgl.user("Construct a character within the following format:")
-        s += sgl.assistant("Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n")
+        s += sgl.assistant(
+            "Name: Steve Jobs.\nBirthday: February 24, 1955.\nJob: Apple CEO.\n"
+        )
         s += sgl.user("Please generate new Name, Birthday and Job.\n")
-        s += sgl.assistant("Name:" + sgl.gen("name", stop="\n") + "\nBirthday:" + sgl.gen("birthday", stop="\n") + "\nJob:" + sgl.gen("job", stop="\n"))
+        s += sgl.assistant(
+            "Name:"
+            + sgl.gen("name", stop="\n")
+            + "\nBirthday:"
+            + sgl.gen("birthday", stop="\n")
+            + "\nJob:"
+            + sgl.gen("job", stop="\n")
+        )
 
-    gen_character_spec().sync()
+    gen_character_spec().sync()
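Note: these tests check that speculative execution lowers prompt-token usage (`usage_with_spec < usage_with_no_spec`): the backend generates past the first stop string in one request and later `sgl.gen` calls are matched against that buffered text instead of re-sending the prompt. A hedged sketch of wiring such a program to a backend, assuming speculation is enabled via `num_api_spec_tokens` (that keyword value and the model name are illustrative):

```python
import sglang as sgl


@sgl.function(num_api_spec_tokens=64)
def gen_character(s):
    # With speculation enabled, these three gens can share one API request.
    s += "Name:" + sgl.gen("name", stop="\n")
    s += "\nBirthday:" + sgl.gen("birthday", stop="\n")
    s += "\nJob:" + sgl.gen("job", stop="\n")


sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo-instruct"))
state = gen_character.run()
print(state["name"], state["birthday"], state["job"])
```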
sglang/utils.py CHANGED
@@ -15,7 +15,6 @@ from json import dumps
 import numpy as np
 import requests
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -255,8 +254,10 @@ def run_with_timeout(func, args=(), kwargs=None, timeout=None):
 
 def graceful_registry(sub_module_name):
     def graceful_shutdown(signum, frame):
-        logger.info(f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown...")
+        logger.info(
+            f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
+        )
         if signum == signal.SIGTERM:
             logger.info(f"{sub_module_name} recive sigterm")
 
-    signal.signal(signal.SIGTERM, graceful_shutdown)
+    signal.signal(signal.SIGTERM, graceful_shutdown)
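Note: `graceful_registry` installs a SIGTERM handler so a manager process can terminate its workers cleanly. A minimal usage sketch (the worker name and loop are illustrative):

```python
from sglang.utils import graceful_registry


def worker_main():
    # Install the SIGTERM handler before entering the work loop.
    graceful_registry("detokenizer")
    while True:
        ...  # handle requests
```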
{sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.17
+Version: 0.1.19
 Summary: A structured generation langauge for LLMs.
 License: Apache License
                                  Version 2.0, January 2004
@@ -213,6 +213,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests
 Requires-Dist: tqdm
+Requires-Dist: numpy
 Provides-Extra: all
 Requires-Dist: sglang[srt] ; extra == 'all'
 Requires-Dist: sglang[openai] ; extra == 'all'
@@ -220,30 +221,28 @@ Requires-Dist: sglang[anthropic] ; extra == 'all'
 Requires-Dist: sglang[litellm] ; extra == 'all'
 Provides-Extra: anthropic
 Requires-Dist: anthropic >=0.20.0 ; extra == 'anthropic'
-Requires-Dist: numpy ; extra == 'anthropic'
 Provides-Extra: litellm
 Requires-Dist: litellm >=1.0.0 ; extra == 'litellm'
 Provides-Extra: openai
 Requires-Dist: openai >=1.0 ; extra == 'openai'
-Requires-Dist: numpy ; extra == 'openai'
 Requires-Dist: tiktoken ; extra == 'openai'
 Provides-Extra: srt
 Requires-Dist: aiohttp ; extra == 'srt'
 Requires-Dist: fastapi ; extra == 'srt'
+Requires-Dist: hf-transfer ; extra == 'srt'
+Requires-Dist: huggingface-hub ; extra == 'srt'
+Requires-Dist: interegular ; extra == 'srt'
+Requires-Dist: packaging ; extra == 'srt'
+Requires-Dist: pillow ; extra == 'srt'
 Requires-Dist: psutil ; extra == 'srt'
+Requires-Dist: pydantic ; extra == 'srt'
 Requires-Dist: rpyc ; extra == 'srt'
 Requires-Dist: torch ; extra == 'srt'
-Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: uvicorn ; extra == 'srt'
+Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: zmq ; extra == 'srt'
-Requires-Dist: vllm ==0.4.3 ; extra == 'srt'
-Requires-Dist: interegular ; extra == 'srt'
-Requires-Dist: pydantic ; extra == 'srt'
-Requires-Dist: pillow ; extra == 'srt'
-Requires-Dist: packaging ; extra == 'srt'
-Requires-Dist: huggingface-hub ; extra == 'srt'
-Requires-Dist: hf-transfer ; extra == 'srt'
-Requires-Dist: outlines >=0.0.34 ; extra == 'srt'
+Requires-Dist: vllm ==0.5.1 ; extra == 'srt'
+Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
 
 <div align="center">
 <img src="assets/logo.png" alt="logo" width="400"></img>
@@ -257,8 +256,8 @@ SGLang is a structured generation language designed for large language models (L
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
 
 The core features include:
-- **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
-- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
+- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -279,19 +278,33 @@ The core features include:
 ### Method 1: With pip
 ```
 pip install "sglang[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
 ### Method 2: From source
 ```
-git clone git@github.com:sgl-project/sglang.git
+git clone https://github.com/sgl-project/sglang.git
 cd sglang
 
-pip install --upgrade pip
 pip install -e "python[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
-### Notes
-- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
+### Method 3: Using docker
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+
+### Common Notes
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+```
+pip uninstall -y triton triton-nightly
+pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+```
+- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
@@ -511,8 +524,8 @@ for out in state.text_iter():
 ```
 
 ### Tips and Implementation Details
-- The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
-- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
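Note: for reference, a short sketch of the two arguments described in the tips above, run against a local SRT server (the endpoint URL, regex, and prompt are illustrative):

```python
import sglang as sgl


@sgl.function
def qa(s, question):
    s += "Q: " + question + "\n"
    # `choices` ranks the candidates by normalized log probability.
    s += "A (yes/no): " + sgl.gen("answer", choices=["yes", "no"])
    # `regex` constrains decoding via logit bias masking.
    s += "\nLoopback IP: " + sgl.gen("ip", regex=r"\d{1,3}(\.\d{1,3}){3}")


sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = qa.run(question="Is 127.0.0.1 a loopback address?")
print(state["answer"], state["ip"])
```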
@@ -569,7 +582,6 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-
 By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
 
 If needed, you can also override the chat template when launching the server:
@@ -598,7 +610,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 
 ### Additional Arguments
-- Add `--tp 2` to enable tensor parallelism.
+- Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
@@ -610,16 +622,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 
 ### Supported Models
 - Llama
 - Mistral
 - Mixtral
-- Qwen / Qwen 2
-- Gemma
-  - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+- Qwen / Qwen 2 / Qwen 2 MoE
+- Gemma / Gemma 2
   - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -632,6 +642,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - StableLM
 - Command-R
 - DBRX
+- Grok
+- ChatGLM
 - AWQ/GPTQ/Marlin quantization
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
@@ -643,17 +655,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 ![mixtral_8x7b](assets/mixtral_8x7b.jpg)
 
-Learn more [here](docs/benchmark_results.md).
+- Learn more about the above [results](docs/benchmark_results.md).
+- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
 
 ## Roadmap
 https://github.com/sgl-project/sglang/issues/157
 
 ## Citation And Acknowledgment
 ```
-@misc{zheng2023efficiently,
-      title={Efficiently Programming Large Language Models using SGLang},
-      author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
-      year={2023},
+@misc{zheng2024sglang,
+      title={SGLang: Efficient Execution of Structured Language Model Programs},
+      author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+      year={2024},
       eprint={2312.07104},
       archivePrefix={arXiv},
       primaryClass={cs.AI}
sglang-0.1.19.dist-info/RECORD ADDED
@@ -0,0 +1,81 @@
+sglang/__init__.py,sha256=GriWuMrszCcPLrLQRv50jP0Crc6b8CLsBA3UYM36ISw,1116
+sglang/api.py,sha256=W_FO5JTrW9I-DoGx2O8cLhcSA6LJqgplrOIqAX-ryNA,5560
+sglang/bench_latency.py,sha256=Ln3DbLmTwIhgsiFZH0_L5Fd3Sc5jM_Vb9PFZytX76hM,10299
+sglang/global_config.py,sha256=1HsHrPFgkqCc5iIwrweKQ0HLip0DLogtpm9vaqbZqfE,1426
+sglang/launch_server.py,sha256=X8TX6M-tv9JWHJkWnJskYNc0IZBooecI_yzpBHVf5KU,364
+sglang/launch_server_llavavid.py,sha256=cxGJICBTYVgHVNy7NWwitY7VXt11kEnh7npkcB-iRf8,1115
+sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
+sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
+sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
+sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
+sglang/backend/runtime_endpoint.py,sha256=XTHAoN_EAwdfADc6vq9tuqri7udGMUih8dStgTuKV1g,9077
+sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
+sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/chat_template.py,sha256=hLX1qpXaUQi7PFndAwbOoOeGlX0NekskR_HndAvGnwQ,13307
+sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
+sglang/lang/interpreter.py,sha256=0phpQs4PooVvVJCzzyNrTv2OFevI5fsU1FcN4roxqhY,29628
+sglang/lang/ir.py,sha256=ZGXJbJELlt8D8H7CyW3IqcRpZm8Pp7h_hLQw46NSb6I,16639
+sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
+sglang/srt/conversation.py,sha256=kuMrdYtcpy2F7qACMEYdD1CniP6HHNRSvhqVZe8jj_w,15522
+sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
+sglang/srt/hf_transformers_utils.py,sha256=H3YnLtx05q65A1tn1JWNZOUhMtq6jANRhhMo6JJr6mg,10728
+sglang/srt/memory_pool.py,sha256=5bqI8d5_JURbKwIhv1BwlcIO2IDHewHvIqezPG-b_5M,3284
+sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
+sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
+sglang/srt/openai_api_adapter.py,sha256=iw-FquXQeM2Z4nxOoYGFPjTkIdgA8rQkh_IcmJRy-R0,15143
+sglang/srt/openai_protocol.py,sha256=-KJsGx2izL3Fc5EhOGi9PAXExuaq-DKRk0UlNjts11E,5348
+sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
+sglang/srt/server.py,sha256=ntl5XwnbOm2favQWbqVULXBUOLhXsgZ3mf1i2MY4e14,13226
+sglang/srt/server_args.py,sha256=rvJImd-b9CVveg_V7n7dSotlro6q6pAqBk7lOxRC7nk,12307
+sglang/srt/utils.py,sha256=e-yPzqDMCGsPgEf4TIe7CEh44lsKpZnclsrMtBggS_Y,19366
+sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
+sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
+sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
+sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
+sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
+sglang/srt/layers/extend_attention.py,sha256=sVd94ViwwQaQDuE94sPMg6Ac6VOp7nX80hFol8qr85Q,13008
+sglang/srt/layers/fused_moe.py,sha256=uyrbCaIHioq3G00xQUrCo53hYDoHzk5rep3Eji3oQiQ,20258
+sglang/srt/layers/logits_processor.py,sha256=RCHjWxlKlB_Mc2iOMHQKvKN9gjqg4oqgodS6gr3qCbA,9672
+sglang/srt/layers/radix_attention.py,sha256=e468GCYteIuVOW7T9xols-IqXS0hJysmicvEiwD0xIM,6857
+sglang/srt/layers/token_attention.py,sha256=eKUUU5pvYsF5EGthfbv-L_IUlg366l5e5X1eWTkE_Xw,8908
+sglang/srt/managers/detokenizer_manager.py,sha256=2oYNtYrSwtfu8G-QcFz_vZK6Buq-eHuZGg9VpxVhYOI,3492
+sglang/srt/managers/io_struct.py,sha256=aCI4yYtKoioP459lWRN8kqVf4tvYYr_IhZaSnvJylgY,4533
+sglang/srt/managers/tokenizer_manager.py,sha256=h5nOR8NHCwEm52wiL-ZA1hoM_pvMuyG0j7Zj1h7aMxk,14898
+sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
+sglang/srt/managers/controller/infer_batch.py,sha256=wOuvi4lNhVEZtfXZKinBXCubG_VEaRTv60ijbHpSMgM,25713
+sglang/srt/managers/controller/manager_multi.py,sha256=Z0a-iZzqk8T2Xl7ak2DgE9j00GA6Eb0XoNVx7UlxKa4,6630
+sglang/srt/managers/controller/manager_single.py,sha256=5c33d1jPgOtys5gmfZe79UD7aXrsV--1Yq9Yc24bh1g,3469
+sglang/srt/managers/controller/model_runner.py,sha256=a-1RKjA12U11BvDbnOECyPf6rpxes895pEZ0-Hyxo6c,21888
+sglang/srt/managers/controller/radix_cache.py,sha256=fMqIm1fTvufI9I_QMoFLfQMkSUWp8VN4wh3-63KJUL0,8193
+sglang/srt/managers/controller/schedule_heuristic.py,sha256=_ne7W2mrpuO794uh5tYLR3q6XBbgTMdNmE6VpzY1sJE,2312
+sglang/srt/managers/controller/tp_worker.py,sha256=WBqL5_VVDAf3o12ymZwxQn7RYZ_dm_w2dXCnMVQ5L3M,31828
+sglang/srt/models/chatglm.py,sha256=BU0rdp-GCUZcmctBYFFo6i5s5XOUJCQbr-v4EQjwJKo,13275
+sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
+sglang/srt/models/dbrx.py,sha256=lv0nXFGJnmv6toUBRv7q7M1ZTrI3VACrvLBKHA6xdjE,14074
+sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
+sglang/srt/models/gemma2.py,sha256=x3Dua-TVwRm5fJjo5UDekdoWqwt9xYbMuB-ogfXyiT8,15860
+sglang/srt/models/grok.py,sha256=oy-QoCvUKKQO2sR6a_qwHm10Fc0t-ka4I-1uEGGW3j8,27274
+sglang/srt/models/llama2.py,sha256=FIUlkFoBhRNidU_Tlcr4UbSqzKPdz3wBc9OocN_CzQs,12188
+sglang/srt/models/llama_classification.py,sha256=bLuugRFcPGEaNd58_LFOkWqOru2rCAGChhBw9dSu7pc,4349
+sglang/srt/models/llava.py,sha256=M0zQwOvnqYkTQgH2aJqsjLLIXQNkadO61UCPpx8A1zQ,17903
+sglang/srt/models/llavavid.py,sha256=7NQ5IzC8G1yrsNbFYS_8CAUpuh0LxM9vEPKD2IZT99g,13029
+sglang/srt/models/minicpm.py,sha256=vYCGjUjYIYVroiV2kOXWdWIPF6__vkN8JnRK-DqgKNI,13271
+sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
+sglang/srt/models/mixtral.py,sha256=lpasWpwvWPHqSQ1Vskr2kL3e_oBxRxlYK6bk6sf61AQ,20810
+sglang/srt/models/mixtral_quant.py,sha256=SMqOnuToJ8pz_7wb10pn7Uib15cXBcqSrtGsh5sVhw8,13635
+sglang/srt/models/qwen.py,sha256=fTRtEXdYPWIOtmwKb4kVFrq65w7AYxjsYqV8ar5mmac,9419
+sglang/srt/models/qwen2.py,sha256=F3k21F_CCqFJMIkzLC-1mIFQOgtEHbuZfIaautNC8-s,11465
+sglang/srt/models/qwen2_moe.py,sha256=hV3dF_AzYONd-pQEmEkrrwpTZC6A7K4wY1_cph9UC54,18421
+sglang/srt/models/stablelm.py,sha256=LbO8rruVkvvLng6pVHG4wjbewrGfMLm9vKxK41V2W_s,10781
+sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
+sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
+sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
+sglang/test/test_programs.py,sha256=g80P0QWO8Jv_87onTCsvJ-2MgSh7I6_lzcfdm43JlNY,13616
+sglang/test/test_utils.py,sha256=Mjn2btfmEQQ7rpsLfNo6VugXCPzUmRpNhssWvxevN4s,11038
+sglang-0.1.19.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.1.19.dist-info/METADATA,sha256=iSIkO_DxfMHQIEv7ZdMXWwi_weLZtf8YRNS80vjf1Kk,30262
+sglang-0.1.19.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
+sglang-0.1.19.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.1.19.dist-info/RECORD,,
{sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.43.0)
+Generator: setuptools (70.2.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 