sglang 0.2.13__py3-none-any.whl → 0.2.14.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. sglang/api.py +6 -0
  2. sglang/bench_latency.py +7 -3
  3. sglang/bench_serving.py +50 -26
  4. sglang/check_env.py +15 -0
  5. sglang/lang/chat_template.py +10 -5
  6. sglang/lang/compiler.py +4 -0
  7. sglang/lang/interpreter.py +1 -0
  8. sglang/lang/ir.py +9 -0
  9. sglang/launch_server.py +8 -1
  10. sglang/srt/constrained/fsm_cache.py +11 -2
  11. sglang/srt/constrained/jump_forward.py +1 -0
  12. sglang/srt/conversation.py +50 -1
  13. sglang/srt/hf_transformers_utils.py +22 -23
  14. sglang/srt/layers/activation.py +100 -1
  15. sglang/srt/layers/decode_attention.py +338 -50
  16. sglang/srt/layers/fused_moe/layer.py +2 -2
  17. sglang/srt/layers/logits_processor.py +56 -19
  18. sglang/srt/layers/radix_attention.py +3 -4
  19. sglang/srt/layers/sampler.py +101 -0
  20. sglang/srt/managers/controller_multi.py +2 -8
  21. sglang/srt/managers/controller_single.py +7 -10
  22. sglang/srt/managers/detokenizer_manager.py +20 -9
  23. sglang/srt/managers/io_struct.py +44 -11
  24. sglang/srt/managers/policy_scheduler.py +5 -2
  25. sglang/srt/managers/schedule_batch.py +46 -166
  26. sglang/srt/managers/tokenizer_manager.py +192 -83
  27. sglang/srt/managers/tp_worker.py +118 -24
  28. sglang/srt/mem_cache/memory_pool.py +82 -8
  29. sglang/srt/mm_utils.py +79 -7
  30. sglang/srt/model_executor/cuda_graph_runner.py +32 -8
  31. sglang/srt/model_executor/forward_batch_info.py +51 -26
  32. sglang/srt/model_executor/model_runner.py +201 -58
  33. sglang/srt/models/gemma2.py +10 -6
  34. sglang/srt/models/gpt_bigcode.py +1 -1
  35. sglang/srt/models/grok.py +11 -1
  36. sglang/srt/models/llama_embedding.py +4 -0
  37. sglang/srt/models/llava.py +176 -59
  38. sglang/srt/models/qwen2.py +9 -3
  39. sglang/srt/openai_api/adapter.py +200 -39
  40. sglang/srt/openai_api/protocol.py +2 -0
  41. sglang/srt/sampling/sampling_batch_info.py +136 -0
  42. sglang/srt/{sampling_params.py → sampling/sampling_params.py} +22 -0
  43. sglang/srt/server.py +92 -57
  44. sglang/srt/server_args.py +43 -15
  45. sglang/srt/utils.py +26 -16
  46. sglang/test/runners.py +22 -30
  47. sglang/test/simple_eval_common.py +9 -10
  48. sglang/test/simple_eval_gpqa.py +2 -1
  49. sglang/test/simple_eval_humaneval.py +2 -2
  50. sglang/test/simple_eval_math.py +2 -1
  51. sglang/test/simple_eval_mmlu.py +2 -1
  52. sglang/test/test_activation.py +55 -0
  53. sglang/test/test_utils.py +36 -53
  54. sglang/version.py +1 -1
  55. {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/METADATA +100 -27
  56. sglang-0.2.14.post1.dist-info/RECORD +114 -0
  57. {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/WHEEL +1 -1
  58. sglang/launch_server_llavavid.py +0 -29
  59. sglang-0.2.13.dist-info/RECORD +0 -112
  60. {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/LICENSE +0 -0
  61. {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/top_level.txt +0 -0
sglang/test/simple_eval_common.py CHANGED
@@ -1,13 +1,12 @@
  # Adapted from https://github.com/openai/simple-evals/

- import base64
  import os
  import resource
  import time
  from collections import defaultdict
  from dataclasses import dataclass, field
  from multiprocessing.pool import ThreadPool
- from typing import Any, Dict, List, Tuple
+ from typing import Any, Dict, List, Optional, Tuple

  import httpx
  import jinja2
@@ -44,8 +43,8 @@ class EvalResult:
      Result of running an evaluation (usually consisting of many samples)
      """

-     score: float | None  # top-line metric
-     metrics: Dict[str, float] | None  # other metrics
+     score: Optional[float]  # top-line metric
+     metrics: Optional[Dict[str, float]]  # other metrics
      htmls: List[str]  # strings of valid HTML
      convos: List[MessageList]  # sampled conversations

@@ -56,10 +55,10 @@ class SingleEvalResult:
      Result of evaluating a single sample
      """

-     score: float | None
+     score: Optional[float]
      metrics: Dict[str, float] = field(default_factory=dict)
-     html: str | None = None
-     convo: MessageList | None = None  # sampled conversation
+     html: Optional[str] = None
+     convo: Optional[MessageList] = None  # sampled conversation


  class Eval:
@@ -89,8 +88,8 @@ class ChatCompletionSampler(SamplerBase):
      def __init__(
          self,
          base_url: str = None,
-         model: str | None = None,
-         system_message: str | None = None,
+         model: Optional[str] = None,
+         system_message: Optional[str] = None,
          temperature: float = 0.0,
          max_tokens: int = 2048,
      ):
@@ -272,7 +271,7 @@ def _compute_stat(values: list, stat: str):
  def aggregate_results(
      single_eval_results: List[SingleEvalResult],
      default_stats: Tuple[str] = ("mean", "std"),
-     name2stats: Dict[str, Tuple[str]] | None = None,
+     name2stats: Optional[Dict[str, Tuple[str]]] = None,
  ) -> EvalResult:
      """
      Aggregate results from multiple evaluations into a single EvalResult.
sglang/test/simple_eval_gpqa.py CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2311.12022

  import random
  import re
+ from typing import Optional

  import pandas

@@ -28,7 +29,7 @@ class GPQAEval(Eval):
      def __init__(
          self,
          filename: str,
-         num_examples: int | None,
+         num_examples: Optional[int],
          num_threads: int,
          n_repeats: int = 1,
      ):
sglang/test/simple_eval_humaneval.py CHANGED
@@ -9,7 +9,7 @@ https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
  import random
  import re
  from concurrent.futures import ThreadPoolExecutor, as_completed
- from typing import Dict, List
+ from typing import Dict, List, Optional

  import tqdm

@@ -61,7 +61,7 @@ def evaluate_functional_correctness(
  class HumanEval(Eval):
      def __init__(
          self,
-         num_examples: int | None,
+         num_examples: Optional[int],
          num_threads: int,
          num_samples_per_task: int = 5,
          ks_passes: List[int] = [1, 2, 5],
sglang/test/simple_eval_math.py CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2103.03874

  import random
  import re
+ from typing import Optional

  import pandas

@@ -36,7 +37,7 @@ class MathEval(Eval):
          self,
          filename: str,
          equality_checker: SamplerBase,
-         num_examples: int | None,
+         num_examples: Optional[int],
          num_threads: int,
      ):
          df = pandas.read_csv(filename)
sglang/test/simple_eval_mmlu.py CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2009.03300

  import random
  import re
+ from typing import Optional

  import pandas

@@ -84,7 +85,7 @@ subject2category = {


  class MMLUEval(Eval):
-     def __init__(self, filename: str, num_examples: int | None, num_threads: int):
+     def __init__(self, filename: str, num_examples: Optional[int], num_threads: int):
          df = pandas.read_csv(filename)
          examples = [row.to_dict() for _, row in df.iterrows()]
          if num_examples:
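The five hunks above all make the same change: annotations written with PEP 604's `X | None` union syntax are rewritten as `typing.Optional[X]`. The two spellings are equivalent, but `X | None` is only valid as a runtime-evaluated annotation on Python 3.10+, so the `Optional` form keeps these eval scripts importable on older interpreters. A minimal illustration (the function below is a hypothetical example, not code from sglang):

```python
from typing import Optional


def pick_num_examples(num_examples: Optional[int] = None) -> int:
    # Optional[int] is shorthand for Union[int, None]; on Python >= 3.10 the
    # same annotation could also be written as `int | None`.
    return 10 if num_examples is None else num_examples


print(pick_num_examples())   # 10
print(pick_num_examples(3))  # 3
```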
sglang/test/test_activation.py ADDED
@@ -0,0 +1,55 @@
+ import itertools
+ import unittest
+
+ import torch
+
+ from sglang.srt.layers.activation import GeluAndMul
+
+
+ class TestGeluAndMul(unittest.TestCase):
+     DTYPES = [torch.half, torch.bfloat16]
+     NUM_TOKENS = [7, 83, 2048]
+     D = [512, 4096, 5120, 13824]
+     SEEDS = [0]
+
+     @classmethod
+     def setUpClass(cls):
+         if not torch.cuda.is_available():
+             raise unittest.SkipTest("CUDA is not available")
+         torch.set_default_device("cuda")
+
+     def _run_gelu_and_mul_test(self, num_tokens, d, dtype, seed):
+         torch.manual_seed(seed)
+
+         layer = GeluAndMul().to(dtype=dtype)
+         x = torch.randn(num_tokens, 2 * d, dtype=dtype)
+
+         with torch.inference_mode():
+             ref_out = layer.forward_native(x)
+             out = layer.forward_cuda(x)
+
+         if dtype == torch.bfloat16:
+             atol = rtol = 1e-2
+         else:
+             atol = rtol = 1e-3
+
+         self.assertTrue(torch.allclose(out, ref_out, atol=atol, rtol=rtol))
+
+     def test_gelu_and_mul(self):
+         for params in itertools.product(
+             self.NUM_TOKENS,
+             self.D,
+             self.DTYPES,
+             self.SEEDS,
+         ):
+             with self.subTest(
+                 num_tokens=params[0],
+                 d=params[1],
+                 dtype=params[2],
+                 seed=params[3],
+             ):
+                 self._run_gelu_and_mul_test(*params)
+
+
+ if __name__ == "__main__":
+     unittest.main(verbosity=2)
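For context on what this new test exercises: `GeluAndMul` is a gated activation that splits the last dimension of the input in half, applies GELU to the first half, and multiplies it elementwise by the second half, so an input of shape `(num_tokens, 2 * d)` yields an output of shape `(num_tokens, d)`. A rough reference sketch follows; the helper name and the exact GELU variant (tanh vs. erf approximation) are assumptions for illustration, not code taken from `sglang.srt.layers.activation`.

```python
import torch
import torch.nn.functional as F


def gelu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    # Hypothetical reference for the behavior checked above; sglang's
    # GeluAndMul may use a different GELU approximation.
    d = x.shape[-1] // 2
    gate, up = x[..., :d], x[..., d:]
    return F.gelu(gate, approximate="tanh") * up


x = torch.randn(7, 2 * 512)
print(gelu_and_mul_reference(x).shape)  # torch.Size([7, 512])
```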
sglang/test/test_utils.py CHANGED
@@ -2,11 +2,10 @@

  import argparse
  import asyncio
- import multiprocessing
+ import os
  import subprocess
  import threading
  import time
- import unittest
  from functools import partial
  from typing import Callable, List, Optional

@@ -18,14 +17,19 @@ import torch.nn.functional as F
  from sglang.global_config import global_config
  from sglang.lang.backend.openai import OpenAI
  from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+ from sglang.srt.utils import kill_child_process
  from sglang.utils import get_exception_traceback

  DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
  DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
- DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157"
- DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157"
- DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157"
- DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:9157"
+ DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+
+ if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+     DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
+     DEFAULT_URL_FOR_TEST = "http://127.0.0.1:6157"
+ else:
+     DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 1157
+     DEFAULT_URL_FOR_TEST = "http://127.0.0.1:2157"


  def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
@@ -104,31 +108,8 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None):
      return pred


- def call_generate_ginfer(prompt, temperature, max_tokens, stop=None, url=None):
-     import grpc
-     from ginfer import sampler_pb2, sampler_pb2_grpc
-
-     sampler_channel = grpc.insecure_channel(url.replace("http://", ""))
-     sampler = sampler_pb2_grpc.SamplerStub(sampler_channel)
-
-     if stop is None:
-         stop_strings = None
-     else:
-         stop_strings = [stop]
-
-     sample_request = sampler_pb2.SampleTextRequest(
-         prompt=prompt,
-         settings=sampler_pb2.SampleSettings(
-             max_len=max_tokens,
-             rng_seed=0,
-             temperature=max(temperature, 1e-7),
-             nucleus_p=1,
-             stop_strings=stop_strings,
-         ),
-     )
-     stream = sampler.SampleText(sample_request)
-     response = "".join([x.text for x in stream])
-     return response
+ def call_generate_gserver(prompt, temperature, max_tokens, stop=None, url=None):
+     raise NotImplementedError()


  def call_generate_guidance(
@@ -271,7 +252,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
              "vllm",
              "outlines",
              "lightllm",
-             "ginfer",
+             "gserver",
              "guidance",
              "lmql",
              "srt-raw",
@@ -292,7 +273,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
          "lightllm": 22000,
          "lmql": 23000,
          "srt-raw": 30000,
-         "ginfer": 9988,
+         "gserver": 9988,
      }
      args.port = default_port.get(args.backend, None)
      return args
@@ -328,8 +309,8 @@ def _get_call_generate(args: argparse.Namespace):
          return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate")
      elif args.backend == "srt-raw":
          return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate")
-     elif args.backend == "ginfer":
-         return partial(call_generate_ginfer, url=f"{args.host}:{args.port}")
+     elif args.backend == "gserver":
+         return partial(call_generate_gserver, url=f"{args.host}:{args.port}")
      elif args.backend == "outlines":
          return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate")
      elif args.backend == "guidance":
@@ -480,34 +461,36 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
      success = True

      for filename in files:
+         global process

-         def func():
-             print(f"\n\nRun {filename}\n\n")
-             ret = unittest.main(module=None, argv=["", "-vb"] + [filename])
-
-         p = multiprocessing.Process(target=func)
-
-         def run_one_file():
-             p.start()
-             p.join()
+         def run_one_file(filename):
+             filename = os.path.join(os.getcwd(), filename)
+             print(f"\n\nRun:\npython3 {filename}\n\n", flush=True)
+             process = subprocess.Popen(
+                 ["python3", filename], stdout=None, stderr=None, env=os.environ
+             )
+             process.wait()
+             return process.returncode

          try:
-             run_with_timeout(run_one_file, timeout=timeout_per_file)
-             if p.exitcode != 0:
-                 success = False
-                 break
+             ret_code = run_with_timeout(
+                 run_one_file, args=(filename,), timeout=timeout_per_file
+             )
+             assert ret_code == 0
          except TimeoutError:
-             p.terminate()
+             kill_child_process(process.pid)
              time.sleep(5)
              print(
-                 f"\nTimeout after {timeout_per_file} seconds when running {filename}\n"
+                 f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
+                 flush=True,
              )
-             return False
+             success = False
+             break

      if success:
-         print(f"Success. Time elapsed: {time.time() - tic:.2f}s")
+         print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
      else:
-         print(f"Fail. Time elapsed: {time.time() - tic:.2f}s")
+         print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)

      return 0 if success else -1

sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.13"
+ __version__ = "0.2.14.post1"
{sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.13
+ Version: 0.2.14.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -231,6 +231,7 @@ Requires-Dist: openai>=1.0; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
  Provides-Extra: srt
  Requires-Dist: aiohttp; extra == "srt"
+ Requires-Dist: decord; extra == "srt"
  Requires-Dist: fastapi; extra == "srt"
  Requires-Dist: hf-transfer; extra == "srt"
  Requires-Dist: huggingface-hub; extra == "srt"
@@ -244,12 +245,14 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
  Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm==0.5.4; extra == "srt"
+ Requires-Dist: vllm==0.5.5; extra == "srt"
  Requires-Dist: outlines>=0.0.44; extra == "srt"
  Provides-Extra: test
  Requires-Dist: jsonlines; extra == "test"
  Requires-Dist: matplotlib; extra == "test"
  Requires-Dist: pandas; extra == "test"
+ Requires-Dist: sentence-transformers; extra == "test"
+ Requires-Dist: accelerate; extra == "test"

  <div align="center">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -270,17 +273,18 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

  The core features include:
- - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
+ - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
  - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.

  ## News
  - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
- - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+ - [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

  <details>
  <summary>More</summary>

+ - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -308,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.13 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -334,14 +338,60 @@ docker run --gpus all \

  ### Method 4: Using docker compose

+ <details>
+ <summary>More</summary>
+
  > This method is recommended if you plan to serve it as a service.
  > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).

  1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
  2. Execute the command `docker compose up -d` in your terminal.
+ </details>
+
+ ### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+ <details>
+ <summary>More</summary>
+
+ To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+ 1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+ 2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+ <details>
+ <summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+ ```yaml
+ # sglang.yaml
+ envs:
+   HF_TOKEN: null
+
+ resources:
+   image_id: docker:lmsysorg/sglang:latest
+   accelerators: A100
+   ports: 30000
+
+ run: |
+   conda deactivate
+   python3 -m sglang.launch_server \
+     --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+     --host 0.0.0.0 \
+     --port 30000
+ ```
+ </details>
+
+ ```bash
+ # Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+ HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+ # Get the HTTP API endpoint
+ sky status --endpoint 30000 sglang
+ ```
+ 3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+ </details>
+

  ### Common Notes
- - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

  ## Backend: SGLang Runtime (SRT)
@@ -395,6 +445,13 @@ response = client.chat.completions.create(
      max_tokens=64,
  )
  print(response)
+
+ # Text embedding
+ response = client.embeddings.create(
+     model="default",
+     input="How are you today",
+ )
+ print(response)
  ```

  It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
@@ -431,19 +488,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct

  ### Supported Models

+ **Generative Models**
+
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
- - LLaVA 1.5 / 1.6
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
- - LLaVA-NeXT-Video
-   - see [examples/usage/llava_video](examples/usage/llava_video)
+ - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+ - LLaVA 1.5 / 1.6 / NeXT
+   - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+   - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
  - Yi-VL
-   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
  - StableLM
  - Command-R
  - DBRX
@@ -451,37 +510,52 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - ChatGLM
  - InternLM 2

+ **Embedding Models**
+
+ - e5-mistral
+ - gte-Qwen2
+   - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

  #### Use Models From ModelScope
- To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+ <details>
+ <summary>More</summary>
+
+ To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
  ```
  export SGLANG_USE_MODELSCOPE=true
  ```
  Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
  ```
  SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
- ```
+ ```
+
+ </details>

  #### Run Llama 3.1 405B
+ <details>
+ <summary>More</summary>

  ```bash
- ## Run 405B (fp8) on a single node
+ # Run 405B (fp8) on a single node
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8

- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+ # Run 405B (fp16) on two nodes
+ ## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph

- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ## on the second node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
  ```

+ </details>
+
  ### Benchmark Performance

- - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
+ - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+   Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle.
+   A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, please use `sglang.bench_serving` instead.
  ```
  python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
  ```
@@ -614,7 +688,7 @@ def tip_suggestion(s):
      s += "In summary" + sgl.gen("summary")
  ```

- #### Multi Modality
+ #### Multi-Modality
  Use `sgl.image` to pass an image as input.

  ```python
@@ -668,7 +742,7 @@ def character_gen(s, name):
      s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
  ```

- See also [json_decode.py](examples/usage/json_decode.py) for an additional example on specifying formats with Pydantic models.
+ See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

  #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.
@@ -730,7 +804,6 @@ def chat_example(s):
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

-
  ## Benchmark And Performance
  ![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
  ![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)