sglang 0.1.21__py3-none-any.whl → 0.1.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. sglang/__init__.py +8 -8
  2. sglang/api.py +1 -1
  3. sglang/backend/vertexai.py +5 -4
  4. sglang/bench.py +627 -0
  5. sglang/bench_latency.py +22 -19
  6. sglang/bench_serving.py +758 -0
  7. sglang/check_env.py +171 -0
  8. sglang/lang/backend/__init__.py +0 -0
  9. sglang/lang/backend/anthropic.py +77 -0
  10. sglang/lang/backend/base_backend.py +80 -0
  11. sglang/lang/backend/litellm.py +90 -0
  12. sglang/lang/backend/openai.py +438 -0
  13. sglang/lang/backend/runtime_endpoint.py +283 -0
  14. sglang/lang/backend/vertexai.py +149 -0
  15. sglang/lang/tracer.py +1 -1
  16. sglang/launch_server.py +1 -1
  17. sglang/launch_server_llavavid.py +1 -4
  18. sglang/srt/conversation.py +1 -1
  19. sglang/srt/layers/context_flashattention_nopad.py +0 -29
  20. sglang/srt/layers/extend_attention.py +0 -39
  21. sglang/srt/layers/linear.py +869 -0
  22. sglang/srt/layers/quantization/__init__.py +49 -0
  23. sglang/srt/layers/quantization/fp8.py +662 -0
  24. sglang/srt/layers/radix_attention.py +31 -5
  25. sglang/srt/layers/token_attention.py +1 -51
  26. sglang/srt/managers/controller/cuda_graph_runner.py +14 -12
  27. sglang/srt/managers/controller/infer_batch.py +47 -49
  28. sglang/srt/managers/controller/manager_multi.py +107 -100
  29. sglang/srt/managers/controller/manager_single.py +76 -96
  30. sglang/srt/managers/controller/model_runner.py +35 -23
  31. sglang/srt/managers/controller/tp_worker.py +127 -138
  32. sglang/srt/managers/detokenizer_manager.py +49 -5
  33. sglang/srt/managers/io_struct.py +36 -17
  34. sglang/srt/managers/tokenizer_manager.py +228 -125
  35. sglang/srt/memory_pool.py +19 -6
  36. sglang/srt/model_loader/model_loader.py +277 -0
  37. sglang/srt/model_loader/utils.py +260 -0
  38. sglang/srt/models/chatglm.py +1 -0
  39. sglang/srt/models/dbrx.py +1 -0
  40. sglang/srt/models/grok.py +1 -0
  41. sglang/srt/models/internlm2.py +317 -0
  42. sglang/srt/models/llama2.py +65 -16
  43. sglang/srt/models/llama_classification.py +1 -0
  44. sglang/srt/models/llava.py +1 -0
  45. sglang/srt/models/llavavid.py +1 -0
  46. sglang/srt/models/minicpm.py +1 -0
  47. sglang/srt/models/mixtral.py +1 -0
  48. sglang/srt/models/mixtral_quant.py +1 -0
  49. sglang/srt/models/qwen.py +1 -0
  50. sglang/srt/models/qwen2.py +6 -0
  51. sglang/srt/models/qwen2_moe.py +7 -4
  52. sglang/srt/models/stablelm.py +1 -0
  53. sglang/srt/openai_api/adapter.py +432 -0
  54. sglang/srt/openai_api/api_adapter.py +432 -0
  55. sglang/srt/openai_api/openai_api_adapter.py +431 -0
  56. sglang/srt/openai_api/openai_protocol.py +207 -0
  57. sglang/srt/openai_api/protocol.py +208 -0
  58. sglang/srt/openai_protocol.py +17 -0
  59. sglang/srt/sampling_params.py +2 -0
  60. sglang/srt/server.py +113 -84
  61. sglang/srt/server_args.py +23 -15
  62. sglang/srt/utils.py +16 -117
  63. sglang/test/test_conversation.py +1 -1
  64. sglang/test/test_openai_protocol.py +1 -1
  65. sglang/test/test_programs.py +1 -1
  66. sglang/test/test_utils.py +2 -2
  67. {sglang-0.1.21.dist-info → sglang-0.1.22.dist-info}/METADATA +157 -167
  68. sglang-0.1.22.dist-info/RECORD +103 -0
  69. {sglang-0.1.21.dist-info → sglang-0.1.22.dist-info}/WHEEL +1 -1
  70. sglang-0.1.21.dist-info/RECORD +0 -82
  71. {sglang-0.1.21.dist-info → sglang-0.1.22.dist-info}/LICENSE +0 -0
  72. {sglang-0.1.21.dist-info → sglang-0.1.22.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py CHANGED
@@ -3,9 +3,9 @@
  import base64
  import fcntl
  import logging
- import multiprocessing
  import os
  import random
+ import resource
  import socket
  import struct
  import time
@@ -16,12 +16,11 @@ from typing import List, Optional
  import numpy as np
  import psutil
  import requests
- import rpyc
  import torch
+ import torch.distributed as dist
  import triton
  from fastapi.responses import JSONResponse
  from packaging import version as pkg_version
- from rpyc.utils.server import ThreadedServer
  from starlette.middleware.base import BaseHTTPMiddleware

  logger = logging.getLogger(__name__)
@@ -148,7 +147,6 @@ def is_port_available(port):
  def allocate_init_ports(
      port: Optional[int] = None,
      additional_ports: Optional[List[int]] = None,
-     tp_size: int = 1,
      dp_size: int = 1,
  ):
      """Allocate ports for all connections."""
@@ -160,8 +158,8 @@
      ret_ports = list(set(x for x in ret_ports if is_port_available(x)))
      cur_port = ret_ports[-1] + 1 if len(ret_ports) > 0 else 10000

-     # HTTP + Tokenizer + Controller + Detokenizer + dp_size * (nccl + tp_size)
-     num_ports_needed = 4 + dp_size * (1 + tp_size)
+     # HTTP + Tokenizer + Controller + Detokenizer + dp_size * 1 (nccl)
+     num_ports_needed = 4 + dp_size
      while len(ret_ports) < num_ports_needed:
          if cur_port not in ret_ports and is_port_available(cur_port):
              ret_ports.append(cur_port)
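The rewritten comment changes the port budget from one port per tensor-parallel rank to a single extra port per data-parallel replica, consistent with the removal of the rpyc-based worker plumbing elsewhere in this diff. A minimal sketch of the arithmetic, using example sizes that are assumptions rather than values from the diff:

```python
# Illustrative only: compare the 0.1.21 and 0.1.22 port formulas for assumed sizes.
dp_size, tp_size = 2, 4  # example values, not taken from the diff

old_num_ports = 4 + dp_size * (1 + tp_size)  # 0.1.21: HTTP, tokenizer, controller, detokenizer + per-rank ports
new_num_ports = 4 + dp_size                  # 0.1.22: one extra (nccl) port per data-parallel replica

print(old_num_ports, new_num_ports)  # 14 6
```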
@@ -188,71 +186,6 @@ def get_int_token_logit_bias(tokenizer, vocab_size):
      return logit_bias


- def wrap_kernel_launcher(kernel):
-     """A faster launcher for triton kernels."""
-     if int(triton.__version__.split(".")[0]) >= 3:
-         return None
-
-     gpu_id = torch.cuda.current_device()
-     kernels = kernel.cache[gpu_id].values()
-     kernel = next(iter(kernels))
-
-     # Different trition versions use different low-level names
-     if hasattr(kernel, "cu_function"):
-         kfunction = kernel.cu_function
-     else:
-         kfunction = kernel.function
-
-     if hasattr(kernel, "c_wrapper"):
-         run = kernel.c_wrapper
-     else:
-         run = kernel.run
-
-     add_cluster_dim = True
-
-     def ret_func(grid, num_warps, *args):
-         nonlocal add_cluster_dim
-
-         try:
-             if add_cluster_dim:
-                 run(
-                     grid[0],
-                     grid[1],
-                     grid[2],
-                     num_warps,
-                     1,
-                     1,
-                     1,
-                     1,
-                     kernel.shared,
-                     0,
-                     kfunction,
-                     None,
-                     None,
-                     kernel,
-                     *args,
-                 )
-             else:
-                 run(
-                     grid[0],
-                     grid[1],
-                     grid[2],
-                     num_warps,
-                     kernel.shared,
-                     0,
-                     kfunction,
-                     None,
-                     None,
-                     kernel,
-                     *args,
-                 )
-         except TypeError:
-             add_cluster_dim = not add_cluster_dim
-             ret_func(grid, num_warps, *args)
-
-     return ret_func
-
-
  def is_multimodal_model(model):
      from sglang.srt.model_config import ModelConfig

@@ -371,49 +304,6 @@ def load_image(image_file):
      return image, image_size


- def connect_rpyc_service(host, port):
-     repeat_count = 0
-     while repeat_count < 20:
-         try:
-             con = rpyc.connect(
-                 host,
-                 port,
-                 config={
-                     "allow_public_attrs": True,
-                     "allow_pickle": True,
-                     "sync_request_timeout": 3600,
-                 },
-             )
-             break
-         except ConnectionRefusedError as e:
-             time.sleep(1)
-         repeat_count += 1
-     if repeat_count == 20:
-         raise RuntimeError(f"Connect rpyc error: {e}")
-
-     return con.root
-
-
- def start_rpyc_service(service: rpyc.Service, port: int):
-     t = ThreadedServer(
-         service=service,
-         port=port,
-         protocol_config={
-             "allow_public_attrs": True,
-             "allow_pickle": True,
-             "sync_request_timeout": 3600,
-         },
-     )
-     t.logger.setLevel(logging.WARN)
-     t.start()
-
-
- def start_rpyc_service_process(service: rpyc.Service, port: int):
-     proc = multiprocessing.Process(target=start_rpyc_service, args=(service, port))
-     proc.start()
-     return proc
-
-
  def suppress_other_loggers():
      from vllm.logger import logger as vllm_default_logger

@@ -445,7 +335,7 @@ def kill_parent_process():
      """Kill the parent process and all children of the parent process."""
      current_process = psutil.Process()
      parent_process = current_process.parent()
-     children = current_process.children(recursive=True)
+     children = parent_process.children(recursive=True)
      for child in children:
          if child.pid != current_process.pid:
              os.kill(child.pid, 9)
@@ -559,7 +449,6 @@ def get_ip_address(ifname):

  def send_addrs_to_rank_0(model_port_args, server_args):
      assert server_args.node_rank != 0 and server_args.dp_size == 1
-     import torch.distributed as dist

      ifname = os.environ.get(
          "SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0")
@@ -591,7 +480,6 @@ def send_addrs_to_rank_0(model_port_args, server_args):

  def receive_addrs(model_port_args, server_args):
      assert server_args.node_rank == 0 and server_args.dp_size == 1
-     import torch.distributed as dist

      ifname = os.environ.get(
          "SGLANG_SOCKET_IFNAME", os.environ.get("NCCL_SOCKET_IFNAME", "eth0")
@@ -624,3 +512,14 @@ def receive_addrs(model_port_args, server_args):

      dist.barrier()
      dist.destroy_process_group()
+
+
+ def set_ulimit(target_soft_limit=65535):
+     resource_type = resource.RLIMIT_NOFILE
+     current_soft, current_hard = resource.getrlimit(resource_type)
+
+     if current_soft < target_soft_limit:
+         try:
+             resource.setrlimit(resource_type, (target_soft_limit, current_hard))
+         except ValueError as e:
+             logger.warn(f"Fail to set RLIMIT_NOFILE: {e}")
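The new `set_ulimit` helper raises the soft limit on open file descriptors, which a server handling many concurrent connections can exhaust quickly. Below is a minimal, self-contained sketch of the same idea; the `__main__` call site is an assumption for illustration and is not part of this diff (the function body mirrors the added code, with `logger.warning` used in place of the deprecated `logger.warn`):

```python
import logging
import resource

logger = logging.getLogger(__name__)


def set_ulimit(target_soft_limit=65535):
    """Raise the soft RLIMIT_NOFILE to target_soft_limit, keeping the hard limit."""
    resource_type = resource.RLIMIT_NOFILE
    current_soft, current_hard = resource.getrlimit(resource_type)

    if current_soft < target_soft_limit:
        try:
            resource.setrlimit(resource_type, (target_soft_limit, current_hard))
        except ValueError as e:
            logger.warning(f"Fail to set RLIMIT_NOFILE: {e}")


if __name__ == "__main__":
    set_ulimit()  # assumed to run once at server startup, before sockets are opened
    print(resource.getrlimit(resource.RLIMIT_NOFILE))
```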
@@ -1,5 +1,5 @@
  from sglang.srt.conversation import generate_chat_conv
- from sglang.srt.managers.openai_protocol import (
+ from sglang.srt.managers.openai_api.protocol import (
      ChatCompletionMessageContentImagePart,
      ChatCompletionMessageContentImageURL,
      ChatCompletionMessageContentTextPart,
@@ -1,4 +1,4 @@
- from sglang.srt.managers.openai_protocol import (
+ from sglang.srt.managers.openai_api.protocol import (
      ChatCompletionMessageContentImagePart,
      ChatCompletionMessageContentImageURL,
      ChatCompletionMessageContentTextPart,
@@ -306,7 +306,7 @@ def test_image_qa():
      assert (
          "taxi" in state.messages()[-1]["content"]
          or "car" in state.messages()[-1]["content"]
-     )
+     ), f"{state.messages()[-1]['content']}"


  def test_stream():
sglang/test/test_utils.py CHANGED
@@ -6,9 +6,9 @@ from functools import partial
  import numpy as np
  import requests

- from sglang.backend.openai import OpenAI
- from sglang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.global_config import global_config
+ from sglang.lang.backend.openai import OpenAI
+ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.utils import get_exception_traceback


{sglang-0.1.21.dist-info → sglang-0.1.22.dist-info}/METADATA CHANGED
@@ -1,7 +1,7 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.1.21
- Summary: A structured generation langauge for LLMs.
+ Version: 0.1.22
+ Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
           Version 2.0, January 2004
           http://www.apache.org/licenses/
@@ -236,7 +236,6 @@ Requires-Dist: packaging ; extra == 'srt'
  Requires-Dist: pillow ; extra == 'srt'
  Requires-Dist: psutil ; extra == 'srt'
  Requires-Dist: pydantic ; extra == 'srt'
- Requires-Dist: rpyc ; extra == 'srt'
  Requires-Dist: torch ; extra == 'srt'
  Requires-Dist: uvicorn ; extra == 'srt'
  Requires-Dist: uvloop ; extra == 'srt'
@@ -252,23 +251,29 @@ Requires-Dist: outlines >=0.0.44 ; extra == 'srt'

  | [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |

- SGLang is a structured generation language designed for large language models (LLMs).
- It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
+ SGLang is a fast serving framework for large language models and vision language models.
+ It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

  The core features include:
+ - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
  - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
- - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

  ## News
+ - [2024/04] 🔥 SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
- - [2024/01] 🔥 SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).

+ <details>
+ <summary>More</summary>
+
+ - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
+
+ </details>
+
  ## Contents
  - [Install](#install)
- - [Quick Start](#quick-start)
- - [Frontend: Structured Generation Language (SGLang)](#frontend-structured-generation-language-sglang)
  - [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
+ - [Frontend: Structured Generation Language (SGLang)](#frontend-structured-generation-language-sglang)
  - [Benchmark And Performance](#benchmark-and-performance)
  - [Roadmap](#roadmap)
  - [Citation And Acknowledgment](#citation-and-acknowledgment)
@@ -297,6 +302,16 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ### Method 3: Using docker
  The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).

+ ```bash
+ docker run --gpus all \
+     -p 30000:30000 \
+     -v ~/.cache/huggingface:/root/.cache/huggingface \
+     --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
+     --ipc=host \
+     lmsysorg/sglang:latest \
+     python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B --host 0.0.0.0 --port 30000
+ ```
+
  ### Common Notes
  - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
  ```
@@ -306,13 +321,129 @@ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/
  - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

- ## Quick Start
+ ## Backend: SGLang Runtime (SRT)
+ The SGLang Runtime (SRT) is an efficient serving engine.
+
+ ### Quick Start
+ Launch a server
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
+ ```
+
+ Send a request
+ ```
+ curl http://localhost:30000/generate \
+   -H "Content-Type: application/json" \
+   -d '{
+     "text": "Once upon a time,",
+     "sampling_params": {
+       "max_new_tokens": 16,
+       "temperature": 0
+     }
+   }'
+ ```
+ Learn more about the argument format [here](docs/sampling_params.md).
+
+ ### OpenAI Compatible API
+ In addition, the server supports OpenAI-compatible APIs.
+
+ ```python
+ import openai
+ client = openai.Client(
+     base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
+
+ # Text completion
+ response = client.completions.create(
+     model="default",
+     prompt="The capital of France is",
+     temperature=0,
+     max_tokens=32,
+ )
+ print(response)
+
+ # Chat completion
+ response = client.chat.completions.create(
+     model="default",
+     messages=[
+         {"role": "system", "content": "You are a helpful AI assistant"},
+         {"role": "user", "content": "List 3 countries and their capitals."},
+     ],
+     temperature=0,
+     max_tokens=64,
+ )
+ print(response)
+ ```
+
+ It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+
+ ### Additional Server Arguments
+ - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
+ ```
+ - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
+ ```
+ - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
+ ```
+ - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
+ ```
+ # Node 0
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
+
+ # Node 1
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
+ ```
+ - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+
+ ### Supported Models
+
+ - Llama / Llama 2 / Llama 3
+ - Mistral / Mixtral
+ - Gemma / Gemma 2
+ - Qwen / Qwen 2 / Qwen 2 MoE
+ - LLaVA 1.5 / 1.6
+   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
+   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
+   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
+ - LLaVA-NeXT-Video
+   - see [examples/usage/llava_video](examples/usage/llava_video)
+ - Yi-VL
+   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
+ - StableLM
+ - Command-R
+ - DBRX
+ - Grok
+ - ChatGLM
+ - InternLM 2
+
+ Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
+
+ ### Benchmark Performance
+
+ - Benchmark a single static batch. Run the following command without launching a server. The arguments are the same as those for `launch_server.py`.
+ ```
+ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
+ ```
+ - Benchmark online serving. Launch a server first and run the following command.
+ ```
+ python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+ ```
+
+ ## Frontend: Structured Generation Language (SGLang)
+ The frontend language can be used with local models or API models.
+
+ ### Quick Start
  The example below shows how to use sglang to answer a multi-turn question.

- ### Using Local Models
+ #### Using Local Models
  First, launch a server with
  ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
  ```

  Then, connect to the server and answer a multi-turn question.
@@ -341,7 +472,7 @@ for m in state.messages():
  print(state["answer_1"])
  ```

- ### Using OpenAI Models
+ #### Using OpenAI Models
  Set the OpenAI API Key
  ```
  export OPENAI_API_KEY=sk-******
@@ -372,13 +503,12 @@ for m in state.messages():
  print(state["answer_1"])
  ```

- ### More Examples
+ #### More Examples

  Anthropic and VertexAI (Gemini) models are also supported.
  You can find more examples at [examples/quick_start](examples/quick_start).

- ## Frontend: Structured Generation Language (SGLang)
-
+ ### Language Feature
  To begin with, import sglang.
  ```python
  import sglang as sgl
@@ -391,7 +521,7 @@ The system will manage the state, chat template, parallelism and batching for yo

  The complete code for the examples below can be found at [readme_examples.py](examples/usage/readme_examples.py)

- ### Control Flow
+ #### Control Flow
  You can use any Python code within the function body, including control flow, nested function calls, and external libraries.

  ```python
@@ -406,7 +536,7 @@ def tool_use(s, question):
      s += "The key word to search is" + sgl.gen("word")
  ```

- ### Parallelism
+ #### Parallelism
  Use `fork` to launch parallel prompts.
  Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel.

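The hunk above only captures the surrounding prose; the `fork` example it refers to is unchanged and therefore not displayed. As a rough sketch of the pattern being described (the prompt text and the two-way fork are illustrative, and a default backend such as `RuntimeEndpoint("http://localhost:30000")` is assumed to be configured; the closing `sgl.gen("summary")` line is visible as context in the next hunk):

```python
import sglang as sgl


@sgl.function
def tip_suggestion(s):
    s += "Here are two tips for staying healthy: 1. Balanced Diet. 2. Regular Exercise.\n\n"

    # fork(2) copies the prompt state; each sgl.gen below is non-blocking,
    # so the two branches are expanded in parallel by the runtime.
    forks = s.fork(2)
    for i, f in enumerate(forks):
        f += f"Now, expand tip {i+1} into a paragraph:\n"
        f += sgl.gen("detail", max_tokens=64, stop="\n\n")

    s += "Tip 1:" + forks[0]["detail"] + "\n"
    s += "Tip 2:" + forks[1]["detail"] + "\n"
    s += "In summary" + sgl.gen("summary")
```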
@@ -428,7 +558,7 @@ def tip_suggestion(s):
      s += "In summary" + sgl.gen("summary")
  ```

- ### Multi Modality
+ #### Multi Modality
  Use `sgl.image` to pass an image as input.

  ```python
@@ -440,7 +570,7 @@ def image_qa(s, image_file, question):

  See also [srt_example_llava.py](examples/quick_start/srt_example_llava.py).

- ### Constrained Decoding
+ #### Constrained Decoding
  Use `regex` to specify a regular expression as a decoding constraint.
  This is only supported for local models.

@@ -455,7 +585,7 @@ def regular_expression_gen(s):
  )
  ```

- ### JSON Decoding
+ #### JSON Decoding
  Use `regex` to specify a JSON schema with a regular expression.

  ```python
@@ -484,8 +614,7 @@ def character_gen(s, name):

  See also [json_decode.py](examples/usage/json_decode.py) for an additional example on specifying formats with Pydantic models.

-
- ### Batching
+ #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.

  ```python
@@ -504,7 +633,7 @@ states = text_qa.run_batch(
  )
  ```

- ### Streaming
+ #### Streaming
  Add `stream=True` to enable streaming.

  ```python
@@ -523,139 +652,10 @@ for out in state.text_iter():
      print(out, end="", flush=True)
  ```

- ### Tips and Implementation Details
+ #### Tips and Implementation Details
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

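The first tip describes picking among `choices` by token-length normalized log probability. A standalone sketch of that scoring rule with made-up numbers (this is not the runtime's actual code path):

```python
# Illustrative only: score each candidate by its mean per-token log probability
# and select the highest-scoring one, as the tip above describes.

def select_choice(choice_token_logprobs):
    # choice_token_logprobs maps each choice to the log probabilities of its tokens.
    def normalized(logprobs):
        return sum(logprobs) / len(logprobs)  # length-normalized log probability

    return max(choice_token_logprobs, key=lambda c: normalized(choice_token_logprobs[c]))


example = {
    "Paris": [-0.2, -0.3],                              # 2 tokens, mean -0.25
    "The capital is Paris": [-0.5, -0.4, -0.6, -0.7],   # 4 tokens, mean -0.55; longer answers are not penalized for length alone
}
print(select_choice(example))  # -> "Paris"
```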
- ## Backend: SGLang Runtime (SRT)
- The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
- However, it can also be used as a standalone API server.
- In this case, the [RadixAttention](https://arxiv.org/abs/2312.07104) can still greatly accelerate many use cases with automatic KV cache reuse.
-
- ### Usage
- Launch a server
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
- ```
-
- Send a request
- ```
- curl http://localhost:30000/generate \
-   -H "Content-Type: application/json" \
-   -d '{
-     "text": "Once upon a time,",
-     "sampling_params": {
-       "max_new_tokens": 16,
-       "temperature": 0
-     }
-   }'
- ```
- Learn more about the argument format [here](docs/sampling_params.md).
-
- ### OpenAI Compatible API
- In addition, the server supports an experimental OpenAI-compatible API.
-
- ```python
- import openai
- client = openai.Client(
-     base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
-
- # Text completion
- response = client.completions.create(
-     model="default",
-     prompt="The capital of France is",
-     temperature=0,
-     max_tokens=32,
- )
- print(response)
-
- # Chat completion
- response = client.chat.completions.create(
-     model="default",
-     messages=[
-         {"role": "system", "content": "You are a helpful AI assistant"},
-         {"role": "user", "content": "List 3 countries and their capitals."},
-     ],
-     temperature=0,
-     max_tokens=64,
- )
- print(response)
- ```
-
- By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
-
- If needed, you can also override the chat template when launching the server:
-
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
- ```
-
- If the chat template you are looking for is missing, you are welcome to contribute it.
- Meanwhile, you can also temporarily register your chat template as follows:
-
- ```json
- {
-   "name": "my_model",
-   "system": "<|im_start|>system",
-   "user": "<|im_start|>user",
-   "assistant": "<|im_start|>assistant",
-   "sep_style": "CHATML",
-   "sep": "<|im_end|>",
-   "stop_str": ["<|im_end|>", "<|im_start|>"]
- }
- ```
-
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template ./my_model_template.json
- ```
-
- ### Additional Arguments
- - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
- ```
- - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
- ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
- ```
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
- ```
- - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
- - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
- ```
- # Node 0
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
-
- # Node 1
- python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
- ```
-
- ### Supported Models
- - Llama
- - Mistral
- - Mixtral
- - Qwen / Qwen 2 / Qwen 2 MoE
- - Gemma / Gemma 2
-   - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
- - LLaVA
-   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
- - LLaVA-NeXT-Video
-   - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
- - Yi-VL
-   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
- - StableLM
- - Command-R
- - DBRX
- - Grok
- - ChatGLM
- - AWQ/GPTQ/Marlin quantization
-
- Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
-
  ## Benchmark And Performance
  - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
  ![llama_7b](assets/llama_7b.jpg)
@@ -667,18 +667,8 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
  - Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).

  ## Roadmap
- https://github.com/sgl-project/sglang/issues/157
+ [Development Roadmap (2024 Q3)](https://github.com/sgl-project/sglang/issues/634)

  ## Citation And Acknowledgment
- ```
- @misc{zheng2024sglang,
-       title={SGLang: Efficient Execution of Structured Language Model Programs},
-       author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
-       year={2024},
-       eprint={2312.07104},
-       archivePrefix={arXiv},
-       primaryClass={cs.AI}
- }
- ```
-
- We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
+ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+ We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).