sglang 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sglang/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- __version__ = "0.1.3"
1
+ __version__ = "0.1.4"
2
2
 
3
3
  from sglang.api import *
4
4
  from sglang.global_config import global_config
@@ -6,6 +6,9 @@ import triton.language as tl
6
6
  from sglang.srt.utils import wrap_kernel_launcher
7
7
 
8
8
 
9
+ CUDA_CAPABILITY = torch.cuda.get_device_capability()
10
+
11
+
9
12
  @triton.jit
10
13
  def _fwd_kernel(
11
14
  Q,
@@ -120,7 +123,11 @@ cached_kernel = None
120
123
 
121
124
 
122
125
  def context_attention_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len):
123
- BLOCK = 128
126
+ if CUDA_CAPABILITY[0] >= 8:
127
+ BLOCK = 128
128
+ else:
129
+ BLOCK = 64
130
+
124
131
  Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
125
132
  assert Lq == Lk and Lk == Lv
126
133
  assert Lk in {16, 32, 64, 128}
@@ -2,6 +2,10 @@ import torch
2
2
  import triton
3
3
  import triton.language as tl
4
4
  from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
5
+ from sglang.srt.utils import wrap_kernel_launcher
6
+
7
+
8
+ CUDA_CAPABILITY = torch.cuda.get_device_capability()
5
9
 
6
10
 
7
11
  @triton.jit
@@ -153,6 +157,9 @@ def _fwd_kernel(
153
157
  tl.store(O_Extend + offs_o, acc / deno[:, None], mask=mask_m[:, None])
154
158
 
155
159
 
160
+ cached_kernel = None
161
+
162
+
156
163
  def extend_attention_fwd(
157
164
  q_extend,
158
165
  k_extend,
@@ -175,7 +182,11 @@ def extend_attention_fwd(
175
182
 
176
183
  k_buffer, v_buffer: (prefix + extend) tensors in mem_manager
177
184
  """
178
- BLOCK_M, BLOCK_N = 128, 128
185
+ if CUDA_CAPABILITY[0] >= 8:
186
+ BLOCK_M, BLOCK_N = 128, 128
187
+ else:
188
+ BLOCK_M, BLOCK_N = 64, 64
189
+
179
190
  Lq, Lk, Lv, Lo = (
180
191
  q_extend.shape[-1],
181
192
  k_extend.shape[-1],
@@ -193,6 +204,40 @@ def extend_attention_fwd(
193
204
  num_warps = 4 if Lk <= 64 else 8
194
205
  num_stages = 1
195
206
 
207
+ global cached_kernel
208
+ if cached_kernel:
209
+ cached_kernel(
210
+ grid,
211
+ num_warps,
212
+ q_extend,
213
+ k_extend,
214
+ v_extend,
215
+ o_extend,
216
+ k_buffer,
217
+ v_buffer,
218
+ req_to_tokens,
219
+ b_req_idx,
220
+ b_seq_len,
221
+ b_start_loc_extend,
222
+ b_seq_len_extend,
223
+ sm_scale,
224
+ kv_group_num,
225
+ q_extend.stride(0),
226
+ q_extend.stride(1),
227
+ k_extend.stride(0),
228
+ k_extend.stride(1),
229
+ v_extend.stride(0),
230
+ v_extend.stride(1),
231
+ o_extend.stride(0),
232
+ o_extend.stride(1),
233
+ k_buffer.stride(0),
234
+ k_buffer.stride(1),
235
+ v_buffer.stride(0),
236
+ v_buffer.stride(1),
237
+ req_to_tokens.stride(0),
238
+ )
239
+ return
240
+
196
241
  _fwd_kernel[grid](
197
242
  q_extend,
198
243
  k_extend,
@@ -226,6 +271,7 @@ def extend_attention_fwd(
226
271
  num_warps=num_warps,
227
272
  num_stages=num_stages,
228
273
  )
274
+ cached_kernel = wrap_kernel_launcher(_fwd_kernel)
229
275
 
230
276
 
231
277
  def redundant_attention(
@@ -5,6 +5,7 @@ import time
5
5
  from concurrent.futures import ThreadPoolExecutor
6
6
  from enum import Enum, auto
7
7
  from typing import Dict, List, Optional, Tuple, Union
8
+ import warnings
8
9
 
9
10
  import numpy as np
10
11
  import rpyc
@@ -164,7 +165,7 @@ class ModelRpcServer(rpyc.Service):
164
165
  + self.tree_cache.evictable_size()
165
166
  )
166
167
  if available_size != self.max_total_num_token:
167
- logger.warning(
168
+ warnings.warn(
168
169
  "Warning: "
169
170
  f"available_size={available_size}, max_total_num_token={self.max_total_num_token}\n"
170
171
  "KV cache pool leak detected!"
sglang/srt/utils.py CHANGED
@@ -209,7 +209,7 @@ def load_image(image_file):
209
209
  elif image_file.lower().endswith(("png", "jpg", "jpeg", "webp", "gif")):
210
210
  image = Image.open(image_file)
211
211
  elif image_file.startswith("data:"):
212
- image_file = image_url.split(",")[1]
212
+ image_file = image_file.split(",")[1]
213
213
  image = Image.open(BytesIO(base64.b64decode(image_file)))
214
214
  else:
215
215
  image = Image.open(BytesIO(base64.b64decode(image_file)))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: A structured generation language for LLMs.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -267,10 +267,20 @@ pip install --upgrade pip
267
267
  pip install -e "python[all]"
268
268
  ```
269
269
 
270
+ ### Notes
271
+ - If you are using older GPUs (NVIDIA T4, V100), please use `pip install "triton>=2.2.0"` to avoid some bugs in the triton compiler
272
+ - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install sglang[openai]`
273
+
270
274
  ## Quick Start
271
275
  The example below shows how to use sglang to answer a multi-turn question.
272
276
 
273
277
  ### Using OpenAI Models
278
+ Set the OpenAI API Key
279
+ ```
280
+ export OPENAI_API_KEY=sk-xxxxxx
281
+ ```
282
+
283
+ Then, answer a multi-turn question.
274
284
  ```python
275
285
  from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
276
286
 
@@ -334,7 +344,7 @@ To begin with, import sglang.
334
344
  import sglang as sgl
335
345
  ```
336
346
 
337
- `sglang` provides some simple primitives such as `gen`, `select`, `fork`.
347
+ `sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
338
348
  You can implement your prompt flow in a function decorated by `sgl.function`.
339
349
  You can then invoke the function with `run` or `run_batch`.
340
350
  The system will manage the state, chat template, and parallelism for you.
@@ -382,10 +392,10 @@ def image_qa(s, image_file, question):
382
392
 
383
393
  ### Constrained Decoding
384
394
  ```python
385
- @function
395
+ @sgl.function
386
396
  def regular_expression_gen(s):
387
397
  s += "Q: What is the IP address of the Google DNS servers?\n"
388
- s += "A: " + gen(
398
+ s += "A: " + sgl.gen(
389
399
  "answer",
390
400
  temperature=0,
391
401
  regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
@@ -426,7 +436,7 @@ for out in state.text_iter():
426
436
  ## Backend: SGLang Runtime (SRT)
427
437
  The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
428
438
  However, it can also be used as a standalone API server.
429
- In this case, the RadixAttention can still greatly accelerate many use cases.
439
+ In this case, the [RadixAttention](https://arxiv.org/abs/2312.07104) can still greatly accelerate many use cases with automatic KV cache reuse.
430
440
 
431
441
  ### Usage
432
442
  Launch a server
@@ -450,6 +460,10 @@ curl http://localhost:30000/v1/completions \
450
460
  ```
451
461
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
452
462
  ```
463
+ - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
464
+ ```
465
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
466
+ ```
453
467
 
454
468
  ### Supported Models
455
469
  - Llama
@@ -466,7 +480,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
466
480
  - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
467
481
  ![mixtral_8x7b](assets/mixtral_8x7b.jpg)
468
482
 
469
- Learn more [here]().
483
+ Learn more [here](docs/benchmark_results.md).
470
484
 
471
485
  ## Roadmap
472
486
  - [ ] Function call
@@ -1,4 +1,4 @@
1
- sglang/__init__.py,sha256=U_vIUJQoAKKm3mK9wNlAiUFO4rk5G0epSNmOO43IQrI,95
1
+ sglang/__init__.py,sha256=lfYPLrb_Fy-J-l7NGMhnRDk0hlvAkCIzJEjzN6AsV0g,95
2
2
  sglang/api.py,sha256=tJuEyB28BUQfl0-dQr4vi6UMHBhUbmyu9Z3iAE5xFcU,3883
3
3
  sglang/flush_cache.py,sha256=cCD_MTlQ5qEv__w0nOthDnVitdAfyscYjksBljwC5Mw,1835
4
4
  sglang/global_config.py,sha256=PAX7TWeFcq0HBzNUWyCONAOjqIokWqw8vT7I6sBSKTc,797
@@ -24,13 +24,13 @@ sglang/srt/model_config.py,sha256=R7YaR8H8AmCJl_1XcSP0zII_5ebZNl0wMXNVANGWd2c,99
24
24
  sglang/srt/sampling_params.py,sha256=Sd9l_uIIuS_mhbzljKwTGDO9ESMviNOYGxOifc71RrY,2895
25
25
  sglang/srt/server.py,sha256=XxTS1K4N5y-ZknLBQefxk1UxC50l6DABVqJOrJ-NG74,6388
26
26
  sglang/srt/server_args.py,sha256=Fpj3To5hEgmWn9qCS-pfypOEh34x9xVmiHBoEx5Smbo,4932
27
- sglang/srt/utils.py,sha256=YtTLEtVnOTrjub0Ct_xjrKGtHIajiQ57FB38l6Dw3a4,5691
27
+ sglang/srt/utils.py,sha256=-2F99bqYT99x1jScMjciJxgQec6CaH6PcCHSmrKHhhY,5692
28
28
  sglang/srt/constrained/fsm.py,sha256=H4kXSsV4IX2ow5TMmnmd-8ho4qqJ5mpVZ4MOH5FUtnY,12900
29
29
  sglang/srt/constrained/fsm_cache.py,sha256=KX4bFX5hj0W66SC9pSvst1ew7etaOMTtTC75z0enRME,1087
30
30
  sglang/srt/constrained/regex.py,sha256=CcV7KBOKS2ZxGoEr6BHG5okagNIGEXYvGvhKXu5gtDA,18689
31
31
  sglang/srt/constrained/tokenizer.py,sha256=rei9yKHFETcbDPOpI7bpIYdrBFgIBhGr_U-zb3r5Beo,7951
32
- sglang/srt/layers/context_flashattention_nopad.py,sha256=qQc35BVOYPoZlLbbTUWB3a43Zwd3v5ZKR_uFRoypUIU,5084
33
- sglang/srt/layers/extend_attention.py,sha256=X-3nrQBeUyA3_cp2vZH1dC85x-EF9rppiK95FocMnKA,11423
32
+ sglang/srt/layers/context_flashattention_nopad.py,sha256=yTQBOo-kKKu5F-YFBo26lCWKtyaGae1M5gsn2GZpfAE,5205
33
+ sglang/srt/layers/extend_attention.py,sha256=nERsTpimVwdF-gHXmjy3D7zbSb4RrbVswmlzuA2NpWA,12559
34
34
  sglang/srt/layers/get_selected_logprob.py,sha256=CpMXM9WXMSB-AVaxBB_aVl1Qx_ZtAFFnjDTm4CgNDpU,2199
35
35
  sglang/srt/layers/logits_processor.py,sha256=rwcXwdZ7-dW9zvJX3MF_EHSxMLbU7TIQ9xUIYRu-WAs,3013
36
36
  sglang/srt/layers/radix_attention.py,sha256=hmPNFg2TkN4EAVUj376N_89RRtUYRwFgUpjj5SydnRk,6170
@@ -41,7 +41,7 @@ sglang/srt/managers/openai_protocol.py,sha256=Eid_734Wup4jsL1ZS2Op0vwRuzvNbF4mV2
41
41
  sglang/srt/managers/tokenizer_manager.py,sha256=jVwr0lM18RFJLhDb5TWlUpQ4Q8tALT4L6GY0jmaZkLw,7861
42
42
  sglang/srt/managers/router/infer_batch.py,sha256=UfS1uVhGnM-62Xv1cfu_IoTeIUxkjkKc4W3trtGbadc,11541
43
43
  sglang/srt/managers/router/manager.py,sha256=H-T-LlnIssHw-FXMHbs3yDQewkTMBCqG6jTYjugopCA,2527
44
- sglang/srt/managers/router/model_rpc.py,sha256=ZLK5izxMGpfCs4uT7DJ8u-aww5UG_jwjr7eJdbWGZ3Y,19271
44
+ sglang/srt/managers/router/model_rpc.py,sha256=G7NvEDguSNj-ZAXBo7GpNQJJHW5WAy_1-qQ7bzqltTU,19286
45
45
  sglang/srt/managers/router/model_runner.py,sha256=U-SBnEeLvwolLcaxyxrPgVG7PnR2rRvuXWV50t9y0Fo,16480
46
46
  sglang/srt/managers/router/radix_cache.py,sha256=ZQPm9HhQ7vD3Gl5nhuvw3ZW4ZRARcplqWed1GYUvHCg,6441
47
47
  sglang/srt/managers/router/scheduler.py,sha256=ejuIRwqqMZVXFKUionRJxy5AtNvK25YoGRO9rFY-rc8,2926
@@ -50,8 +50,8 @@ sglang/srt/models/llava.py,sha256=COS0IC6Yo-QiwKe5emgCbtEe9HgaSu5tt6CQA7UtV38,85
50
50
  sglang/srt/models/mixtral.py,sha256=j91xOt6NZ5tJiyTPqmUSzgJqFAw7vTDnfBtEs5x0jDM,13714
51
51
  sglang/test/test_programs.py,sha256=ua3wufnS3x6d_U3aboY4ivqoglrRPZj18j96vuiUtiE,11348
52
52
  sglang/test/test_utils.py,sha256=Knxg3BTA6d_7XSlprbBCdvfDr2SN5x7LhkT-tZFk5EQ,4828
53
- sglang-0.1.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
54
- sglang-0.1.3.dist-info/METADATA,sha256=SSRJ09MVErF7DrD5lJLm2oBDkk7sySET3AVaxJMciKs,21885
55
- sglang-0.1.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
56
- sglang-0.1.3.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
57
- sglang-0.1.3.dist-info/RECORD,,
53
+ sglang-0.1.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
54
+ sglang-0.1.4.dist-info/METADATA,sha256=qYnmtS2k2ddncYRUOCZoE_SXEli5cFD7yh_JkP7IVWk,22676
55
+ sglang-0.1.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
56
+ sglang-0.1.4.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
57
+ sglang-0.1.4.dist-info/RECORD,,
File without changes