sglang 0.1.2__tar.gz → 0.1.4__tar.gz

This diff reflects the changes between the publicly released sglang 0.1.2 and 0.1.4 packages as they appear in their public registry, and is provided for informational purposes only.
Files changed (62)
  1. {sglang-0.1.2 → sglang-0.1.4}/PKG-INFO +101 -5
  2. {sglang-0.1.2 → sglang-0.1.4}/README.md +100 -4
  3. {sglang-0.1.2 → sglang-0.1.4}/pyproject.toml +1 -1
  4. {sglang-0.1.2 → sglang-0.1.4}/sglang/__init__.py +1 -1
  5. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/context_flashattention_nopad.py +8 -1
  6. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/extend_attention.py +47 -1
  7. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/router/model_rpc.py +2 -1
  8. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/utils.py +1 -1
  9. {sglang-0.1.2 → sglang-0.1.4}/sglang.egg-info/PKG-INFO +101 -5
  10. {sglang-0.1.2 → sglang-0.1.4}/LICENSE +0 -0
  11. {sglang-0.1.2 → sglang-0.1.4}/setup.cfg +0 -0
  12. {sglang-0.1.2 → sglang-0.1.4}/sglang/api.py +0 -0
  13. {sglang-0.1.2 → sglang-0.1.4}/sglang/backend/__init__.py +0 -0
  14. {sglang-0.1.2 → sglang-0.1.4}/sglang/backend/anthropic.py +0 -0
  15. {sglang-0.1.2 → sglang-0.1.4}/sglang/backend/base_backend.py +0 -0
  16. {sglang-0.1.2 → sglang-0.1.4}/sglang/backend/huggingface.py +0 -0
  17. {sglang-0.1.2 → sglang-0.1.4}/sglang/backend/openai.py +0 -0
  18. {sglang-0.1.2 → sglang-0.1.4}/sglang/backend/runtime_endpoint.py +0 -0
  19. {sglang-0.1.2 → sglang-0.1.4}/sglang/backend/tgi.py +0 -0
  20. {sglang-0.1.2 → sglang-0.1.4}/sglang/flush_cache.py +0 -0
  21. {sglang-0.1.2 → sglang-0.1.4}/sglang/global_config.py +0 -0
  22. {sglang-0.1.2 → sglang-0.1.4}/sglang/lang/__init__.py +0 -0
  23. {sglang-0.1.2 → sglang-0.1.4}/sglang/lang/chat_template.py +0 -0
  24. {sglang-0.1.2 → sglang-0.1.4}/sglang/lang/compiler.py +0 -0
  25. {sglang-0.1.2 → sglang-0.1.4}/sglang/lang/interpreter.py +0 -0
  26. {sglang-0.1.2 → sglang-0.1.4}/sglang/lang/ir.py +0 -0
  27. {sglang-0.1.2 → sglang-0.1.4}/sglang/lang/tracer.py +0 -0
  28. {sglang-0.1.2 → sglang-0.1.4}/sglang/launch_server.py +0 -0
  29. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/backend_config.py +0 -0
  30. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/constrained/fsm.py +0 -0
  31. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/constrained/fsm_cache.py +0 -0
  32. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/constrained/regex.py +0 -0
  33. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/constrained/tokenizer.py +0 -0
  34. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/hf_transformers_utils.py +0 -0
  35. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/get_selected_logprob.py +0 -0
  36. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/logits_processor.py +0 -0
  37. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/radix_attention.py +0 -0
  38. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/token_attention.py +0 -0
  39. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/detokenizer_manager.py +0 -0
  40. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/io_struct.py +0 -0
  41. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/openai_protocol.py +0 -0
  42. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/router/infer_batch.py +0 -0
  43. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/router/manager.py +0 -0
  44. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/router/model_runner.py +0 -0
  45. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/router/radix_cache.py +0 -0
  46. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/router/scheduler.py +0 -0
  47. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/tokenizer_manager.py +0 -0
  48. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/memory_pool.py +0 -0
  49. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/model_config.py +0 -0
  50. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/models/llama2.py +0 -0
  51. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/models/llava.py +0 -0
  52. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/models/mixtral.py +0 -0
  53. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/sampling_params.py +0 -0
  54. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/server.py +0 -0
  55. {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/server_args.py +0 -0
  56. {sglang-0.1.2 → sglang-0.1.4}/sglang/test/test_programs.py +0 -0
  57. {sglang-0.1.2 → sglang-0.1.4}/sglang/test/test_utils.py +0 -0
  58. {sglang-0.1.2 → sglang-0.1.4}/sglang/utils.py +0 -0
  59. {sglang-0.1.2 → sglang-0.1.4}/sglang.egg-info/SOURCES.txt +0 -0
  60. {sglang-0.1.2 → sglang-0.1.4}/sglang.egg-info/dependency_links.txt +0 -0
  61. {sglang-0.1.2 → sglang-0.1.4}/sglang.egg-info/requires.txt +0 -0
  62. {sglang-0.1.2 → sglang-0.1.4}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.1.2 → sglang-0.1.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.2
+Version: 0.1.4
 Summary: A structured generation language for LLMs.
 License: Apache License
          Version 2.0, January 2004
@@ -267,10 +267,20 @@ pip install --upgrade pip
 pip install -e "python[all]"
 ```
 
+### Notes
+- If you are using older GPUs (NVIDIA T4, V100), please run `pip install "triton>=2.2.0"` to avoid some bugs in the Triton compiler.
+- If you only need the OpenAI backend, you can skip the other dependencies with `pip install sglang[openai]`.
+
 ## Quick Start
 The example below shows how to use sglang to answer a multi-turn question.
 
 ### Using OpenAI Models
+Set the OpenAI API key:
+```
+export OPENAI_API_KEY=sk-xxxxxx
+```
+
+Then, answer a multi-turn question.
 ```python
 from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
 
@@ -329,30 +339,104 @@ You can find more examples at [examples/quick_start](examples/quick_start).
 
 ## Frontend: Structured Generation Language (SGLang)
 
+To begin with, import sglang.
+```python
+import sglang as sgl
+```
+
+`sglang` provides simple primitives such as `gen`, `select`, `fork`, and `image`.
+You implement your prompt flow in a function decorated by `sgl.function`,
+and then invoke the function with `run` or `run_batch`.
+The system manages the state, chat template, and parallelism for you.
+
 ### Control Flow
+```python
+@sgl.function
+def control_flow(s, question):
+    s += "To answer this question: " + question + ", "
+    s += "I need to use a " + sgl.gen("tool", choices=["calculator", "web browser"]) + ". "
+
+    # You can use if or nested function calls
+    if s["tool"] == "calculator":
+        s += "The math expression is" + sgl.gen("expression")
+    elif s["tool"] == "web browser":
+        s += "The website url is" + sgl.gen("url")
+```
 
 ### Parallelism
+```python
+@sgl.function
+def tip_suggestion(s):
+    s += (
+        "Here are two tips for staying healthy: "
+        "1. Balanced Diet. 2. Regular Exercise.\n\n"
+    )
+
+    forks = s.fork(2)  # Launch parallel prompts
+    for i, f in enumerate(forks):
+        f += f"Now, expand tip {i+1} into a paragraph:\n"
+        f += sgl.gen("detailed_tip", max_tokens=256, stop="\n\n")
+
+    s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
+    s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
+    s += "In summary" + sgl.gen("summary")
+```
 
 ### Multi Modality
 ```python
 @sgl.function
 def image_qa(s, image_file, question):
     s += sgl.user(sgl.image(image_file) + question)
-    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
+    s += sgl.assistant(sgl.gen("answer", max_tokens=256))
 ```
 
-### Constrained decoding
+### Constrained Decoding
+```python
+@sgl.function
+def regular_expression_gen(s):
+    s += "Q: What is the IP address of the Google DNS servers?\n"
+    s += "A: " + sgl.gen(
+        "answer",
+        temperature=0,
+        regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
+    )
+```
 
 ### Batching
+```python
+@sgl.function
+def text_qa(s, question):
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n")
+
+states = text_qa.run_batch(
+    [
+        {"question": "What is the capital of the United Kingdom?"},
+        {"question": "What is the capital of France?"},
+        {"question": "What is the capital of Japan?"},
+    ],
+)
+```
 
 ### Streaming
+```python
+@sgl.function
+def text_qa(s, question):
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n")
 
-### Other Backends
+state = text_qa.run(
+    question="What is the capital of France?",
+    temperature=0.1,
+    stream=True,
+)
+
+for out in state.text_iter():
+    print(out, end="", flush=True)
+```
 
 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
 However, it can also be used as a standalone API server.
-In this case, the RadixAttention can still greatly accelerate many use cases.
+In this case, [RadixAttention](https://arxiv.org/abs/2312.07104) can still greatly accelerate many use cases with automatic KV cache reuse.
 
 ### Usage
 Launch a server
@@ -376,6 +460,10 @@ curl http://localhost:30000/v1/completions \
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
+- If you see out-of-memory errors during serving, reduce the memory usage of the KV cache pool by setting a smaller `--mem-fraction-static`; the default is `0.9`.
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
+```
 
 ### Supported Models
 - Llama
@@ -386,6 +474,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 
 ## Benchmark And Performance
 
+- Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
+![llama_7b](assets/llama_7b.jpg)
+
+- Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
+![mixtral_8x7b](assets/mixtral_8x7b.jpg)
+
+Learn more [here](docs/benchmark_results.md).
+
 ## Roadmap
 - [ ] Function call
 - [ ] Quantization
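The `curl http://localhost:30000/v1/completions` hunk header above points at SRT's OpenAI-compatible completions route. Below is a rough client sketch against a server launched as in the Usage section; the endpoint path comes from the diff, while the payload fields simply follow the standard OpenAI completions schema and are illustrative assumptions, not taken from this diff.

```python
# Minimal client sketch for a locally launched SRT server.
# Assumes: python -m sglang.launch_server --model-path ... --port 30000
import json
import urllib.request

payload = {
    "model": "default",                # placeholder model name (assumption)
    "prompt": "Say this is a test.",
    "max_tokens": 16,
    "temperature": 0,
}
req = urllib.request.Request(
    "http://localhost:30000/v1/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)
print(body["choices"][0]["text"])
```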
{sglang-0.1.2 → sglang-0.1.4}/README.md

The README.md changes are identical in content to the README portion of the PKG-INFO diff above (the same +100/-4 edits: the installation Notes, the OpenAI API key step, the frontend examples, the `--mem-fraction-static` tip, and the benchmark figures), shifted to README.md's own offsets in hunks @@ -32,10 +32,20 @@, @@ -94,30 +104,104 @@, @@ -141,6 +225,10 @@, and @@ -151,6 +239,14 @@.
{sglang-0.1.2 → sglang-0.1.4}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.1.2"
+version = "0.1.4"
 description = "A structured generation language for LLMs."
 readme = "README.md"
 requires-python = ">=3.8"
{sglang-0.1.2 → sglang-0.1.4}/sglang/__init__.py

@@ -1,4 +1,4 @@
-__version__ = "0.1.2"
+__version__ = "0.1.4"
 
 from sglang.api import *
 from sglang.global_config import global_config
{sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/context_flashattention_nopad.py

@@ -6,6 +6,9 @@ import triton.language as tl
 from sglang.srt.utils import wrap_kernel_launcher
 
 
+CUDA_CAPABILITY = torch.cuda.get_device_capability()
+
+
 @triton.jit
 def _fwd_kernel(
     Q,
@@ -120,7 +123,11 @@ cached_kernel = None
 
 
 def context_attention_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len):
-    BLOCK = 128
+    if CUDA_CAPABILITY[0] >= 8:
+        BLOCK = 128
+    else:
+        BLOCK = 64
+
     Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
     assert Lq == Lk and Lk == Lv
     assert Lk in {16, 32, 64, 128}
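The change above reads the GPU's compute capability once at import time and halves the Triton block size on pre-Ampere hardware, which lines up with the new README note about T4 and V100 GPUs. A minimal sketch of the pattern in isolation (the reduced-resources rationale for older GPUs is an inference; the diff itself only shows the gating):

```python
import torch

# Compute capability as (major, minor): V100 = (7, 0), T4 = (7, 5), A100 = (8, 0).
CUDA_CAPABILITY = torch.cuda.get_device_capability()

def pick_block_size() -> int:
    # Ampere and newer (major >= 8) get the 128-wide tile; older GPUs fall
    # back to 64, mirroring context_attention_fwd in the diff above.
    return 128 if CUDA_CAPABILITY[0] >= 8 else 64
```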
{sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/extend_attention.py

@@ -2,6 +2,10 @@ import torch
 import triton
 import triton.language as tl
 from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
+from sglang.srt.utils import wrap_kernel_launcher
+
+
+CUDA_CAPABILITY = torch.cuda.get_device_capability()
 
 
 @triton.jit
@@ -153,6 +157,9 @@ def _fwd_kernel(
     tl.store(O_Extend + offs_o, acc / deno[:, None], mask=mask_m[:, None])
 
 
+cached_kernel = None
+
+
 def extend_attention_fwd(
     q_extend,
     k_extend,
@@ -175,7 +182,11 @@ def extend_attention_fwd(
 
     k_buffer, v_buffer: (prefix + extend) tensors in mem_manager
     """
-    BLOCK_M, BLOCK_N = 128, 128
+    if CUDA_CAPABILITY[0] >= 8:
+        BLOCK_M, BLOCK_N = 128, 128
+    else:
+        BLOCK_M, BLOCK_N = 64, 64
+
     Lq, Lk, Lv, Lo = (
         q_extend.shape[-1],
         k_extend.shape[-1],
@@ -193,6 +204,40 @@ def extend_attention_fwd(
     num_warps = 4 if Lk <= 64 else 8
     num_stages = 1
 
+    global cached_kernel
+    if cached_kernel:
+        cached_kernel(
+            grid,
+            num_warps,
+            q_extend,
+            k_extend,
+            v_extend,
+            o_extend,
+            k_buffer,
+            v_buffer,
+            req_to_tokens,
+            b_req_idx,
+            b_seq_len,
+            b_start_loc_extend,
+            b_seq_len_extend,
+            sm_scale,
+            kv_group_num,
+            q_extend.stride(0),
+            q_extend.stride(1),
+            k_extend.stride(0),
+            k_extend.stride(1),
+            v_extend.stride(0),
+            v_extend.stride(1),
+            o_extend.stride(0),
+            o_extend.stride(1),
+            k_buffer.stride(0),
+            k_buffer.stride(1),
+            v_buffer.stride(0),
+            v_buffer.stride(1),
+            req_to_tokens.stride(0),
+        )
+        return
+
     _fwd_kernel[grid](
         q_extend,
         k_extend,
@@ -226,6 +271,7 @@ def extend_attention_fwd(
         num_warps=num_warps,
         num_stages=num_stages,
     )
+    cached_kernel = wrap_kernel_launcher(_fwd_kernel)
 
 
 def redundant_attention(
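extend_attention.py now uses the same launcher-caching trick already present in context_flashattention_nopad.py: the first call goes through Triton's normal `kernel[grid](...)` dispatch (which JIT-compiles the kernel), after which `wrap_kernel_launcher` captures a launcher that later calls invoke directly as `(grid, num_warps, *runtime_args)`, skipping Python-side dispatch overhead. A generic sketch of the pattern, assuming `wrap_kernel_launcher` behaves exactly as the diff uses it (the kernel here is a toy vector add, not the sglang attention kernel):

```python
import torch
import triton
import triton.language as tl
from sglang.srt.utils import wrap_kernel_launcher

@triton.jit
def _vec_add(X, Y, Out, n_elements, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n_elements
    tl.store(Out + offs, tl.load(X + offs, mask=mask) + tl.load(Y + offs, mask=mask), mask=mask)

cached_kernel = None  # one cached launcher per process

def vec_add(x, y):
    global cached_kernel
    out = torch.empty_like(x)
    grid = (triton.cdiv(x.numel(), 1024),)
    num_warps = 4
    if cached_kernel:
        # Fast path: reuse the compiled kernel. This is only valid while the
        # constexpr arguments (here BLOCK) stay the same across calls.
        cached_kernel(grid, num_warps, x, y, out, x.numel())
        return out
    _vec_add[grid](x, y, out, x.numel(), BLOCK=1024, num_warps=num_warps)
    cached_kernel = wrap_kernel_launcher(_vec_add)
    return out
```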
{sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/router/model_rpc.py

@@ -5,6 +5,7 @@ import time
 from concurrent.futures import ThreadPoolExecutor
 from enum import Enum, auto
 from typing import Dict, List, Optional, Tuple, Union
+import warnings
 
 import numpy as np
 import rpyc
@@ -164,7 +165,7 @@ class ModelRpcServer(rpyc.Service):
                 + self.tree_cache.evictable_size()
             )
             if available_size != self.max_total_num_token:
-                logger.warning(
+                warnings.warn(
                     "Warning: "
                     f"available_size={available_size}, max_total_num_token={self.max_total_num_token}\n"
                     "KV cache pool leak detected!"
{sglang-0.1.2 → sglang-0.1.4}/sglang/srt/utils.py

@@ -209,7 +209,7 @@ def load_image(image_file):
     elif image_file.lower().endswith(("png", "jpg", "jpeg", "webp", "gif")):
         image = Image.open(image_file)
     elif image_file.startswith("data:"):
-        image_file = image_url.split(",")[1]
+        image_file = image_file.split(",")[1]
         image = Image.open(BytesIO(base64.b64decode(image_file)))
     else:
         image = Image.open(BytesIO(base64.b64decode(image_file)))
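The one-character fix above replaces a reference to an undefined `image_url`, which raised a `NameError` whenever a `data:` URL was passed in, with `image_file`. The corrected branch as a standalone sketch (the function name is illustrative):

```python
import base64
from io import BytesIO

from PIL import Image

def load_data_url(image_file: str) -> Image.Image:
    # "data:image/png;base64,iVBORw0..." -> the base64 payload is
    # everything after the first comma.
    assert image_file.startswith("data:")
    payload = image_file.split(",")[1]
    return Image.open(BytesIO(base64.b64decode(payload)))
```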
{sglang-0.1.2 → sglang-0.1.4}/sglang.egg-info/PKG-INFO

The sglang.egg-info/PKG-INFO diff is a verbatim copy of the PKG-INFO diff at the top of this changeset (same hunks, same +101/-5 changes).
The remaining files listed above are unchanged (+0 -0).