PyPI - sglang - Versions diffs - 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl - Mend

sglang 0.1.3py3-none-any.whl → 0.1.4py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

sglang/__init__.py +1 -1
sglang/srt/layers/context_flashattention_nopad.py +8 -1
sglang/srt/layers/extend_attention.py +47 -1
sglang/srt/managers/router/model_rpc.py +2 -1
sglang/srt/utils.py +1 -1
{sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/METADATA +20 -6
{sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/RECORD +10 -10
{sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/LICENSE +0 -0
{sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/WHEEL +0 -0
{sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/top_level.txt +0 -0

sglang/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-__version__ = "0.1.3"
+__version__ = "0.1.4"
 from sglang.api import *
 from sglang.global_config import global_config

sglang/srt/layers/context_flashattention_nopad.py CHANGED Viewed

@@ -6,6 +6,9 @@ import triton.language as tl
 from sglang.srt.utils import wrap_kernel_launcher
+CUDA_CAPABILITY = torch.cuda.get_device_capability()
 @triton.jit
 def _fwd_kernel(
     Q,
@@ -120,7 +123,11 @@ cached_kernel = None
 def context_attention_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len):
-    BLOCK = 128
+    if CUDA_CAPABILITY[0] >= 8:
+        BLOCK = 128
+    else:
+        BLOCK = 64
     Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
     assert Lq == Lk and Lk == Lv
     assert Lk in {16, 32, 64, 128}

sglang/srt/layers/extend_attention.py CHANGED Viewed

@@ -2,6 +2,10 @@ import torch
 import triton
 import triton.language as tl
 from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
+from sglang.srt.utils import wrap_kernel_launcher
+CUDA_CAPABILITY = torch.cuda.get_device_capability()
 @triton.jit
@@ -153,6 +157,9 @@ def _fwd_kernel(
     tl.store(O_Extend + offs_o, acc / deno[:, None], mask=mask_m[:, None])
+cached_kernel = None
 def extend_attention_fwd(
     q_extend,
     k_extend,
@@ -175,7 +182,11 @@ def extend_attention_fwd(
     k_buffer, v_buffer: (prefix + extend) tensors in mem_manager
     """
-    BLOCK_M, BLOCK_N = 128, 128
+    if CUDA_CAPABILITY[0] >= 8:
+        BLOCK_M, BLOCK_N = 128, 128
+    else:
+        BLOCK_M, BLOCK_N = 64, 64
     Lq, Lk, Lv, Lo = (
         q_extend.shape[-1],
         k_extend.shape[-1],
@@ -193,6 +204,40 @@ def extend_attention_fwd(
     num_warps = 4 if Lk <= 64 else 8
     num_stages = 1
+    global cached_kernel
+    if cached_kernel:
+        cached_kernel(
+            grid,
+            num_warps,
+            q_extend,
+            k_extend,
+            v_extend,
+            o_extend,
+            k_buffer,
+            v_buffer,
+            req_to_tokens,
+            b_req_idx,
+            b_seq_len,
+            b_start_loc_extend,
+            b_seq_len_extend,
+            sm_scale,
+            kv_group_num,
+            q_extend.stride(0),
+            q_extend.stride(1),
+            k_extend.stride(0),
+            k_extend.stride(1),
+            v_extend.stride(0),
+            v_extend.stride(1),
+            o_extend.stride(0),
+            o_extend.stride(1),
+            k_buffer.stride(0),
+            k_buffer.stride(1),
+            v_buffer.stride(0),
+            v_buffer.stride(1),
+            req_to_tokens.stride(0),
+        )
+        return
     _fwd_kernel[grid](
         q_extend,
         k_extend,
@@ -226,6 +271,7 @@ def extend_attention_fwd(
         num_warps=num_warps,
         num_stages=num_stages,
     )
+    cached_kernel = wrap_kernel_launcher(_fwd_kernel)
 def redundant_attention(

sglang/srt/managers/router/model_rpc.py CHANGED Viewed

@@ -5,6 +5,7 @@ import time
 from concurrent.futures import ThreadPoolExecutor
 from enum import Enum, auto
 from typing import Dict, List, Optional, Tuple, Union
+import warnings
 import numpy as np
 import rpyc
@@ -164,7 +165,7 @@ class ModelRpcServer(rpyc.Service):
                     + self.tree_cache.evictable_size()
                 )
                 if available_size != self.max_total_num_token:
-                    logger.warning(
+                    warnings.warn(
                         "Warning: "
                         f"available_size={available_size}, max_total_num_token={self.max_total_num_token}\n"
                         "KV cache pool leak detected!"

sglang/srt/utils.py CHANGED Viewed

@@ -209,7 +209,7 @@ def load_image(image_file):
     elif image_file.lower().endswith(("png", "jpg", "jpeg", "webp", "gif")):
         image = Image.open(image_file)
     elif image_file.startswith("data:"):
-        image_file = image_url.split(",")[1]
+        image_file = image_file.split(",")[1]
         image = Image.open(BytesIO(base64.b64decode(image_file)))
     else:
         image = Image.open(BytesIO(base64.b64decode(image_file)))

{sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.3
+Version: 0.1.4
 Summary: A structured generation language for LLMs.
 License: Apache License
                                    Version 2.0, January 2004
@@ -267,10 +267,20 @@ pip install --upgrade pip
 pip install -e "python[all]"
 ```
+### Notes
+- If you are using older GPUs (NVIDIA T4, V100), please use `pip install "triton>=2.2.0"` to avoid some bugs in the triton compiler.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install sglang[openai]`
 ## Quick Start
 The example below shows how to use sglang to answer a multi-turn question.
 ### Using OpenAI Models
+Set the OpenAI API Key
+```
+export OPENAI_API_KEY=sk-xxxxxx
+```
+Then, answer a multi-turn question.
 ```python
 from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
@@ -334,7 +344,7 @@ To begin with, import sglang.
 import sglang as sgl
 ```
-`sglang` provides some simple primitives such as `gen`, `select`, `fork`.
+`sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
 You can implement your prompt flow in a function decorated by `sgl.function`.
 You can then invoke the function with `run` or `run_batch`.
 The system will manage the state, chat template, and parallelism for you.
@@ -382,10 +392,10 @@ def image_qa(s, image_file, question):
 ### Constrained Decoding
 ```python
-@function
+@sgl.function
 def regular_expression_gen(s):
     s += "Q: What is the IP address of the Google DNS servers?\n"
-    s += "A: " + gen(
+    s += "A: " + sgl.gen(
         "answer",
         temperature=0,
         regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
@@ -426,7 +436,7 @@ for out in state.text_iter():
 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
 However, it can also be used as a standalone API server.
-In this case, the RadixAttention can still greatly accelerate many use cases.
+In this case, the [RadixAttention](https://arxiv.org/abs/2312.07104) can still greatly accelerate many use cases with automatic KV cache reuse.
 ### Usage
 Launch a server
@@ -450,6 +460,10 @@ curl http://localhost:30000/v1/completions \
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
+- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
+```
 ### Supported Models
 - Llama
@@ -466,7 +480,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 ![mixtral_8x7b](assets/mixtral_8x7b.jpg)
-Learn more [here]().
+Learn more [here](docs/benchmark_results.md).
 ## Roadmap
 - [ ] Function call

{sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-sglang/__init__.py,sha256=U_vIUJQoAKKm3mK9wNlAiUFO4rk5G0epSNmOO43IQrI,95
+sglang/__init__.py,sha256=lfYPLrb_Fy-J-l7NGMhnRDk0hlvAkCIzJEjzN6AsV0g,95
 sglang/api.py,sha256=tJuEyB28BUQfl0-dQr4vi6UMHBhUbmyu9Z3iAE5xFcU,3883
 sglang/flush_cache.py,sha256=cCD_MTlQ5qEv__w0nOthDnVitdAfyscYjksBljwC5Mw,1835
 sglang/global_config.py,sha256=PAX7TWeFcq0HBzNUWyCONAOjqIokWqw8vT7I6sBSKTc,797
@@ -24,13 +24,13 @@ sglang/srt/model_config.py,sha256=R7YaR8H8AmCJl_1XcSP0zII_5ebZNl0wMXNVANGWd2c,99
 sglang/srt/sampling_params.py,sha256=Sd9l_uIIuS_mhbzljKwTGDO9ESMviNOYGxOifc71RrY,2895
 sglang/srt/server.py,sha256=XxTS1K4N5y-ZknLBQefxk1UxC50l6DABVqJOrJ-NG74,6388
 sglang/srt/server_args.py,sha256=Fpj3To5hEgmWn9qCS-pfypOEh34x9xVmiHBoEx5Smbo,4932
-sglang/srt/utils.py,sha256=YtTLEtVnOTrjub0Ct_xjrKGtHIajiQ57FB38l6Dw3a4,5691
+sglang/srt/utils.py,sha256=-2F99bqYT99x1jScMjciJxgQec6CaH6PcCHSmrKHhhY,5692
 sglang/srt/constrained/fsm.py,sha256=H4kXSsV4IX2ow5TMmnmd-8ho4qqJ5mpVZ4MOH5FUtnY,12900
 sglang/srt/constrained/fsm_cache.py,sha256=KX4bFX5hj0W66SC9pSvst1ew7etaOMTtTC75z0enRME,1087
 sglang/srt/constrained/regex.py,sha256=CcV7KBOKS2ZxGoEr6BHG5okagNIGEXYvGvhKXu5gtDA,18689
 sglang/srt/constrained/tokenizer.py,sha256=rei9yKHFETcbDPOpI7bpIYdrBFgIBhGr_U-zb3r5Beo,7951
-sglang/srt/layers/context_flashattention_nopad.py,sha256=qQc35BVOYPoZlLbbTUWB3a43Zwd3v5ZKR_uFRoypUIU,5084
-sglang/srt/layers/extend_attention.py,sha256=X-3nrQBeUyA3_cp2vZH1dC85x-EF9rppiK95FocMnKA,11423
+sglang/srt/layers/context_flashattention_nopad.py,sha256=yTQBOo-kKKu5F-YFBo26lCWKtyaGae1M5gsn2GZpfAE,5205
+sglang/srt/layers/extend_attention.py,sha256=nERsTpimVwdF-gHXmjy3D7zbSb4RrbVswmlzuA2NpWA,12559
 sglang/srt/layers/get_selected_logprob.py,sha256=CpMXM9WXMSB-AVaxBB_aVl1Qx_ZtAFFnjDTm4CgNDpU,2199
 sglang/srt/layers/logits_processor.py,sha256=rwcXwdZ7-dW9zvJX3MF_EHSxMLbU7TIQ9xUIYRu-WAs,3013
 sglang/srt/layers/radix_attention.py,sha256=hmPNFg2TkN4EAVUj376N_89RRtUYRwFgUpjj5SydnRk,6170
@@ -41,7 +41,7 @@ sglang/srt/managers/openai_protocol.py,sha256=Eid_734Wup4jsL1ZS2Op0vwRuzvNbF4mV2
 sglang/srt/managers/tokenizer_manager.py,sha256=jVwr0lM18RFJLhDb5TWlUpQ4Q8tALT4L6GY0jmaZkLw,7861
 sglang/srt/managers/router/infer_batch.py,sha256=UfS1uVhGnM-62Xv1cfu_IoTeIUxkjkKc4W3trtGbadc,11541
 sglang/srt/managers/router/manager.py,sha256=H-T-LlnIssHw-FXMHbs3yDQewkTMBCqG6jTYjugopCA,2527
-sglang/srt/managers/router/model_rpc.py,sha256=ZLK5izxMGpfCs4uT7DJ8u-aww5UG_jwjr7eJdbWGZ3Y,19271
+sglang/srt/managers/router/model_rpc.py,sha256=G7NvEDguSNj-ZAXBo7GpNQJJHW5WAy_1-qQ7bzqltTU,19286
 sglang/srt/managers/router/model_runner.py,sha256=U-SBnEeLvwolLcaxyxrPgVG7PnR2rRvuXWV50t9y0Fo,16480
 sglang/srt/managers/router/radix_cache.py,sha256=ZQPm9HhQ7vD3Gl5nhuvw3ZW4ZRARcplqWed1GYUvHCg,6441
 sglang/srt/managers/router/scheduler.py,sha256=ejuIRwqqMZVXFKUionRJxy5AtNvK25YoGRO9rFY-rc8,2926
@@ -50,8 +50,8 @@ sglang/srt/models/llava.py,sha256=COS0IC6Yo-QiwKe5emgCbtEe9HgaSu5tt6CQA7UtV38,85
 sglang/srt/models/mixtral.py,sha256=j91xOt6NZ5tJiyTPqmUSzgJqFAw7vTDnfBtEs5x0jDM,13714
 sglang/test/test_programs.py,sha256=ua3wufnS3x6d_U3aboY4ivqoglrRPZj18j96vuiUtiE,11348
 sglang/test/test_utils.py,sha256=Knxg3BTA6d_7XSlprbBCdvfDr2SN5x7LhkT-tZFk5EQ,4828
-sglang-0.1.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sglang-0.1.3.dist-info/METADATA,sha256=SSRJ09MVErF7DrD5lJLm2oBDkk7sySET3AVaxJMciKs,21885
-sglang-0.1.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-sglang-0.1.3.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
-sglang-0.1.3.dist-info/RECORD,,
+sglang-0.1.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.1.4.dist-info/METADATA,sha256=qYnmtS2k2ddncYRUOCZoE_SXEli5cFD7yh_JkP7IVWk,22676
+sglang-0.1.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+sglang-0.1.4.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.1.4.dist-info/RECORD,,

{sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/LICENSE RENAMED Viewed

File without changes

{sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/WHEEL RENAMED Viewed

File without changes

{sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/top_level.txt RENAMED Viewed

File without changes

sglang 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl

sglang 0.1.3py3-none-any.whl → 0.1.4py3-none-any.whl