sglang 0.2.13__py3-none-any.whl → 0.2.14.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. sglang/api.py +6 -0
  2. sglang/bench_latency.py +7 -3
  3. sglang/bench_serving.py +50 -26
  4. sglang/check_env.py +15 -0
  5. sglang/lang/chat_template.py +10 -5
  6. sglang/lang/compiler.py +4 -0
  7. sglang/lang/interpreter.py +1 -0
  8. sglang/lang/ir.py +9 -0
  9. sglang/launch_server.py +8 -1
  10. sglang/srt/constrained/fsm_cache.py +11 -2
  11. sglang/srt/constrained/jump_forward.py +1 -0
  12. sglang/srt/conversation.py +50 -1
  13. sglang/srt/hf_transformers_utils.py +22 -23
  14. sglang/srt/layers/activation.py +100 -1
  15. sglang/srt/layers/decode_attention.py +338 -50
  16. sglang/srt/layers/fused_moe/layer.py +2 -2
  17. sglang/srt/layers/logits_processor.py +56 -19
  18. sglang/srt/layers/radix_attention.py +3 -4
  19. sglang/srt/layers/sampler.py +101 -0
  20. sglang/srt/managers/controller_multi.py +2 -8
  21. sglang/srt/managers/controller_single.py +7 -10
  22. sglang/srt/managers/detokenizer_manager.py +20 -9
  23. sglang/srt/managers/io_struct.py +44 -11
  24. sglang/srt/managers/policy_scheduler.py +5 -2
  25. sglang/srt/managers/schedule_batch.py +46 -166
  26. sglang/srt/managers/tokenizer_manager.py +192 -83
  27. sglang/srt/managers/tp_worker.py +118 -24
  28. sglang/srt/mem_cache/memory_pool.py +82 -8
  29. sglang/srt/mm_utils.py +79 -7
  30. sglang/srt/model_executor/cuda_graph_runner.py +32 -8
  31. sglang/srt/model_executor/forward_batch_info.py +51 -26
  32. sglang/srt/model_executor/model_runner.py +201 -58
  33. sglang/srt/models/gemma2.py +10 -6
  34. sglang/srt/models/gpt_bigcode.py +1 -1
  35. sglang/srt/models/grok.py +11 -1
  36. sglang/srt/models/llama_embedding.py +4 -0
  37. sglang/srt/models/llava.py +176 -59
  38. sglang/srt/models/qwen2.py +9 -3
  39. sglang/srt/openai_api/adapter.py +200 -39
  40. sglang/srt/openai_api/protocol.py +2 -0
  41. sglang/srt/sampling/sampling_batch_info.py +136 -0
  42. sglang/srt/{sampling_params.py → sampling/sampling_params.py} +22 -0
  43. sglang/srt/server.py +92 -57
  44. sglang/srt/server_args.py +43 -15
  45. sglang/srt/utils.py +26 -16
  46. sglang/test/runners.py +22 -30
  47. sglang/test/simple_eval_common.py +9 -10
  48. sglang/test/simple_eval_gpqa.py +2 -1
  49. sglang/test/simple_eval_humaneval.py +2 -2
  50. sglang/test/simple_eval_math.py +2 -1
  51. sglang/test/simple_eval_mmlu.py +2 -1
  52. sglang/test/test_activation.py +55 -0
  53. sglang/test/test_utils.py +36 -53
  54. sglang/version.py +1 -1
  55. {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/METADATA +100 -27
  56. sglang-0.2.14.post1.dist-info/RECORD +114 -0
  57. {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/WHEEL +1 -1
  58. sglang/launch_server_llavavid.py +0 -29
  59. sglang-0.2.13.dist-info/RECORD +0 -112
  60. {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/LICENSE +0 -0
  61. {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/top_level.txt +0 -0
sglang/test/simple_eval_common.py CHANGED
@@ -1,13 +1,12 @@
  # Adapted from https://github.com/openai/simple-evals/

- import base64
  import os
  import resource
  import time
  from collections import defaultdict
  from dataclasses import dataclass, field
  from multiprocessing.pool import ThreadPool
- from typing import Any, Dict, List, Tuple
+ from typing import Any, Dict, List, Optional, Tuple

  import httpx
  import jinja2
@@ -44,8 +43,8 @@ class EvalResult:
      Result of running an evaluation (usually consisting of many samples)
      """

-     score: float | None  # top-line metric
-     metrics: Dict[str, float] | None  # other metrics
+     score: Optional[float]  # top-line metric
+     metrics: Optional[Dict[str, float]]  # other metrics
      htmls: List[str]  # strings of valid HTML
      convos: List[MessageList]  # sampled conversations

@@ -56,10 +55,10 @@ class SingleEvalResult:
      Result of evaluating a single sample
      """

-     score: float | None
+     score: Optional[float]
      metrics: Dict[str, float] = field(default_factory=dict)
-     html: str | None = None
-     convo: MessageList | None = None  # sampled conversation
+     html: Optional[str] = None
+     convo: Optional[MessageList] = None  # sampled conversation


  class Eval:
@@ -89,8 +88,8 @@ class ChatCompletionSampler(SamplerBase):
      def __init__(
          self,
          base_url: str = None,
-         model: str | None = None,
-         system_message: str | None = None,
+         model: Optional[str] = None,
+         system_message: Optional[str] = None,
          temperature: float = 0.0,
          max_tokens: int = 2048,
      ):
@@ -272,7 +271,7 @@ def _compute_stat(values: list, stat: str):
  def aggregate_results(
      single_eval_results: List[SingleEvalResult],
      default_stats: Tuple[str] = ("mean", "std"),
-     name2stats: Dict[str, Tuple[str]] | None = None,
+     name2stats: Optional[Dict[str, Tuple[str]]] = None,
  ) -> EvalResult:
      """
      Aggregate results from multiple evaluations into a single EvalResult.
sglang/test/simple_eval_gpqa.py CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2311.12022

  import random
  import re
+ from typing import Optional

  import pandas

@@ -28,7 +29,7 @@ class GPQAEval(Eval):
      def __init__(
          self,
          filename: str,
-         num_examples: int | None,
+         num_examples: Optional[int],
          num_threads: int,
          n_repeats: int = 1,
      ):
sglang/test/simple_eval_humaneval.py CHANGED
@@ -9,7 +9,7 @@ https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
  import random
  import re
  from concurrent.futures import ThreadPoolExecutor, as_completed
- from typing import Dict, List
+ from typing import Dict, List, Optional

  import tqdm

@@ -61,7 +61,7 @@ def evaluate_functional_correctness(
  class HumanEval(Eval):
      def __init__(
          self,
-         num_examples: int | None,
+         num_examples: Optional[int],
          num_threads: int,
          num_samples_per_task: int = 5,
          ks_passes: List[int] = [1, 2, 5],
sglang/test/simple_eval_math.py CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2103.03874

  import random
  import re
+ from typing import Optional

  import pandas

@@ -36,7 +37,7 @@ class MathEval(Eval):
          self,
          filename: str,
          equality_checker: SamplerBase,
-         num_examples: int | None,
+         num_examples: Optional[int],
          num_threads: int,
      ):
          df = pandas.read_csv(filename)
sglang/test/simple_eval_mmlu.py CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2009.03300

  import random
  import re
+ from typing import Optional

  import pandas

@@ -84,7 +85,7 @@ subject2category = {


  class MMLUEval(Eval):
-     def __init__(self, filename: str, num_examples: int | None, num_threads: int):
+     def __init__(self, filename: str, num_examples: Optional[int], num_threads: int):
          df = pandas.read_csv(filename)
          examples = [row.to_dict() for _, row in df.iterrows()]
          if num_examples:
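The five hunks above all make the same change: annotations written with PEP 604's `X | None` union syntax are rewritten as `typing.Optional[X]`. The two spellings are equivalent, but `X | None` is only valid as a runtime-evaluated annotation on Python 3.10+, so the `Optional` form keeps these eval scripts importable on older interpreters. A minimal illustration (the function below is a hypothetical example, not code from sglang):

```python
from typing import Optional


def pick_num_examples(num_examples: Optional[int] = None) -> int:
    # Optional[int] is shorthand for Union[int, None]; on Python >= 3.10 the
    # same annotation could also be written as `int | None`.
    return 10 if num_examples is None else num_examples


print(pick_num_examples())   # 10
print(pick_num_examples(3))  # 3
```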
sglang/test/test_activation.py ADDED
@@ -0,0 +1,55 @@
+ import itertools
+ import unittest
+
+ import torch
+
+ from sglang.srt.layers.activation import GeluAndMul
+
+
+ class TestGeluAndMul(unittest.TestCase):
+     DTYPES = [torch.half, torch.bfloat16]
+     NUM_TOKENS = [7, 83, 2048]
+     D = [512, 4096, 5120, 13824]
+     SEEDS = [0]
+
+     @classmethod
+     def setUpClass(cls):
+         if not torch.cuda.is_available():
+             raise unittest.SkipTest("CUDA is not available")
+         torch.set_default_device("cuda")
+
+     def _run_gelu_and_mul_test(self, num_tokens, d, dtype, seed):
+         torch.manual_seed(seed)
+
+         layer = GeluAndMul().to(dtype=dtype)
+         x = torch.randn(num_tokens, 2 * d, dtype=dtype)
+
+         with torch.inference_mode():
+             ref_out = layer.forward_native(x)
+             out = layer.forward_cuda(x)
+
+         if dtype == torch.bfloat16:
+             atol = rtol = 1e-2
+         else:
+             atol = rtol = 1e-3
+
+         self.assertTrue(torch.allclose(out, ref_out, atol=atol, rtol=rtol))
+
+     def test_gelu_and_mul(self):
+         for params in itertools.product(
+             self.NUM_TOKENS,
+             self.D,
+             self.DTYPES,
+             self.SEEDS,
+         ):
+             with self.subTest(
+                 num_tokens=params[0],
+                 d=params[1],
+                 dtype=params[2],
+                 seed=params[3],
+             ):
+                 self._run_gelu_and_mul_test(*params)
+
+
+ if __name__ == "__main__":
+     unittest.main(verbosity=2)
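For context on what this new test exercises: `GeluAndMul` is a gated activation that splits the last dimension of the input in half, applies GELU to the first half, and multiplies it elementwise by the second half, so an input of shape `(num_tokens, 2 * d)` yields an output of shape `(num_tokens, d)`. A rough reference sketch follows; the helper name and the exact GELU variant (tanh vs. erf approximation) are assumptions for illustration, not code taken from `sglang.srt.layers.activation`.

```python
import torch
import torch.nn.functional as F


def gelu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    # Hypothetical reference for the behavior checked above; sglang's
    # GeluAndMul may use a different GELU approximation.
    d = x.shape[-1] // 2
    gate, up = x[..., :d], x[..., d:]
    return F.gelu(gate, approximate="tanh") * up


x = torch.randn(7, 2 * 512)
print(gelu_and_mul_reference(x).shape)  # torch.Size([7, 512])
```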
sglang/test/test_utils.py CHANGED
@@ -2,11 +2,10 @@

  import argparse
  import asyncio
- import multiprocessing
+ import os
  import subprocess
  import threading
  import time
- import unittest
  from functools import partial
  from typing import Callable, List, Optional

@@ -18,14 +17,19 @@ import torch.nn.functional as F
  from sglang.global_config import global_config
  from sglang.lang.backend.openai import OpenAI
  from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+ from sglang.srt.utils import kill_child_process
  from sglang.utils import get_exception_traceback

  DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
  DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
- DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157"
- DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157"
- DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157"
- DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:9157"
+ DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+
+ if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+     DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
+     DEFAULT_URL_FOR_TEST = "http://127.0.0.1:6157"
+ else:
+     DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 1157
+     DEFAULT_URL_FOR_TEST = "http://127.0.0.1:2157"


  def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
@@ -104,31 +108,8 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None):
      return pred


- def call_generate_ginfer(prompt, temperature, max_tokens, stop=None, url=None):
-     import grpc
-     from ginfer import sampler_pb2, sampler_pb2_grpc
-
-     sampler_channel = grpc.insecure_channel(url.replace("http://", ""))
-     sampler = sampler_pb2_grpc.SamplerStub(sampler_channel)
-
-     if stop is None:
-         stop_strings = None
-     else:
-         stop_strings = [stop]
-
-     sample_request = sampler_pb2.SampleTextRequest(
-         prompt=prompt,
-         settings=sampler_pb2.SampleSettings(
-             max_len=max_tokens,
-             rng_seed=0,
-             temperature=max(temperature, 1e-7),
-             nucleus_p=1,
-             stop_strings=stop_strings,
-         ),
-     )
-     stream = sampler.SampleText(sample_request)
-     response = "".join([x.text for x in stream])
-     return response
+ def call_generate_gserver(prompt, temperature, max_tokens, stop=None, url=None):
+     raise NotImplementedError()


  def call_generate_guidance(
@@ -271,7 +252,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
              "vllm",
              "outlines",
              "lightllm",
-             "ginfer",
+             "gserver",
              "guidance",
              "lmql",
              "srt-raw",
@@ -292,7 +273,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
          "lightllm": 22000,
          "lmql": 23000,
          "srt-raw": 30000,
-         "ginfer": 9988,
+         "gserver": 9988,
      }
      args.port = default_port.get(args.backend, None)
      return args
@@ -328,8 +309,8 @@ def _get_call_generate(args: argparse.Namespace):
          return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate")
      elif args.backend == "srt-raw":
          return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate")
-     elif args.backend == "ginfer":
-         return partial(call_generate_ginfer, url=f"{args.host}:{args.port}")
+     elif args.backend == "gserver":
+         return partial(call_generate_gserver, url=f"{args.host}:{args.port}")
      elif args.backend == "outlines":
          return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate")
      elif args.backend == "guidance":
@@ -480,34 +461,36 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
      success = True

      for filename in files:
+         global process

-         def func():
-             print(f"\n\nRun {filename}\n\n")
-             ret = unittest.main(module=None, argv=["", "-vb"] + [filename])
-
-         p = multiprocessing.Process(target=func)
-
-         def run_one_file():
-             p.start()
-             p.join()
+         def run_one_file(filename):
+             filename = os.path.join(os.getcwd(), filename)
+             print(f"\n\nRun:\npython3 {filename}\n\n", flush=True)
+             process = subprocess.Popen(
+                 ["python3", filename], stdout=None, stderr=None, env=os.environ
+             )
+             process.wait()
+             return process.returncode

          try:
-             run_with_timeout(run_one_file, timeout=timeout_per_file)
-             if p.exitcode != 0:
-                 success = False
-                 break
+             ret_code = run_with_timeout(
+                 run_one_file, args=(filename,), timeout=timeout_per_file
+             )
+             assert ret_code == 0
          except TimeoutError:
-             p.terminate()
+             kill_child_process(process.pid)
              time.sleep(5)
              print(
-                 f"\nTimeout after {timeout_per_file} seconds when running {filename}\n"
+                 f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
+                 flush=True,
              )
-             return False
+             success = False
+             break

      if success:
-         print(f"Success. Time elapsed: {time.time() - tic:.2f}s")
+         print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
      else:
-         print(f"Fail. Time elapsed: {time.time() - tic:.2f}s")
+         print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)

      return 0 if success else -1

sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.13"
+ __version__ = "0.2.14.post1"
{sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.13
+ Version: 0.2.14.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -231,6 +231,7 @@ Requires-Dist: openai>=1.0; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
  Provides-Extra: srt
  Requires-Dist: aiohttp; extra == "srt"
+ Requires-Dist: decord; extra == "srt"
  Requires-Dist: fastapi; extra == "srt"
  Requires-Dist: hf-transfer; extra == "srt"
  Requires-Dist: huggingface-hub; extra == "srt"
@@ -244,12 +245,14 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
  Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm==0.5.4; extra == "srt"
+ Requires-Dist: vllm==0.5.5; extra == "srt"
  Requires-Dist: outlines>=0.0.44; extra == "srt"
  Provides-Extra: test
  Requires-Dist: jsonlines; extra == "test"
  Requires-Dist: matplotlib; extra == "test"
  Requires-Dist: pandas; extra == "test"
+ Requires-Dist: sentence-transformers; extra == "test"
+ Requires-Dist: accelerate; extra == "test"

  <div align="center">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -270,17 +273,18 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

  The core features include:
- - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, flashinfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
+ - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
  - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.

  ## News
  - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
- - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+ - [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

  <details>
  <summary>More</summary>

+ - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
  - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
  - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -308,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.13 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -334,14 +338,60 @@ docker run --gpus all \

  ### Method 4: Using docker compose

+ <details>
+ <summary>More</summary>
+
  > This method is recommended if you plan to serve it as a service.
  > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).

  1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
  2. Execute the command `docker compose up -d` in your terminal.
+ </details>
+
+ ### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+ <details>
+ <summary>More</summary>
+
+ To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+ 1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+ 2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+ <details>
+ <summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+ ```yaml
+ # sglang.yaml
+ envs:
+   HF_TOKEN: null
+
+ resources:
+   image_id: docker:lmsysorg/sglang:latest
+   accelerators: A100
+   ports: 30000
+
+ run: |
+   conda deactivate
+   python3 -m sglang.launch_server \
+     --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+     --host 0.0.0.0 \
+     --port 30000
+ ```
+ </details>
+
+ ```bash
+ # Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+ HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+ # Get the HTTP API endpoint
+ sky status --endpoint 30000 sglang
+ ```
+ 3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+ </details>
+

  ### Common Notes
- - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

  ## Backend: SGLang Runtime (SRT)
@@ -395,6 +445,13 @@ response = client.chat.completions.create(
      max_tokens=64,
  )
  print(response)
+
+ # Text embedding
+ response = client.embeddings.create(
+     model="default",
+     input="How are you today",
+ )
+ print(response)
  ```

  It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
@@ -431,19 +488,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct

  ### Supported Models

+ **Generative Models**
+
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
- - LLaVA 1.5 / 1.6
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
-   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
- - LLaVA-NeXT-Video
-   - see [examples/usage/llava_video](examples/usage/llava_video)
+ - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+   - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+ - LLaVA 1.5 / 1.6 / NeXT
+   - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+   - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+   - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
  - Yi-VL
-   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
  - StableLM
  - Command-R
  - DBRX
@@ -451,37 +510,52 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - ChatGLM
  - InternLM 2

+ **Embedding Models**
+
+ - e5-mistral
+ - gte-Qwen2
+   - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

  #### Use Models From ModelScope
- To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+ <details>
+ <summary>More</summary>
+
+ To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
  ```
  export SGLANG_USE_MODELSCOPE=true
  ```
  Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
  ```
  SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
- ```
+ ```
+
+ </details>

  #### Run Llama 3.1 405B
+ <details>
+ <summary>More</summary>

  ```bash
- ## Run 405B (fp8) on a single node
+ # Run 405B (fp8) on a single node
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8

- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+ # Run 405B (fp16) on two nodes
+ ## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph

- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ## on the second node, replace the `172.16.4.52:20000` with your own first node ip address and port
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
  ```

+ </details>
+
  ### Benchmark Performance

- - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
+ - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+   Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle.
+   A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, please use `sglang.bench_serving` instead.
  ```
  python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
  ```
@@ -614,7 +688,7 @@ def tip_suggestion(s):
      s += "In summary" + sgl.gen("summary")
  ```

- #### Multi Modality
+ #### Multi-Modality
  Use `sgl.image` to pass an image as input.

  ```python
@@ -668,7 +742,7 @@ def character_gen(s, name):
      s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
  ```

- See also [json_decode.py](examples/usage/json_decode.py) for an additional example on specifying formats with Pydantic models.
+ See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

  #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.
@@ -730,7 +804,6 @@ def chat_example(s):
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

-
  ## Benchmark And Performance
  ![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
  ![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)