sglang 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sglang/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.1.22"
+ __version__ = "0.1.24"

  # SGL API Components
  from sglang.api import (
sglang/bench_serving.py CHANGED
@@ -5,6 +5,9 @@ Benchmark online serving.

  Usage:
  python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi
  """

  import argparse
@@ -19,6 +22,7 @@ import traceback
  import warnings
  from argparse import ArgumentParser as FlexibleArgumentParser
  from dataclasses import dataclass, field
+ from datetime import datetime
  from typing import AsyncGenerator, List, Optional, Tuple, Union

  import aiohttp
@@ -53,12 +57,80 @@ class RequestFuncOutput:
      itl: List[float] = field(default_factory=list)  # List of inter-token latencies
      prompt_len: int = 0
      error: str = ""
+     output_len: int = 0


  def remove_prefix(text: str, prefix: str) -> str:
      return text[len(prefix) :] if text.startswith(prefix) else text


+ # trt llm not support ignore_eos
+ # https://github.com/triton-inference-server/tensorrtllm_backend/issues/505
+ async def async_request_trt_llm(
+     request_func_input: RequestFuncInput,
+     pbar: Optional[tqdm] = None,
+ ) -> RequestFuncOutput:
+     api_url = request_func_input.api_url
+     assert api_url.endswith("generate_stream")
+
+     async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+         payload = {
+             "accumulate_tokens": True,
+             "text_input": request_func_input.prompt,
+             "temperature": 0.000001,
+             "top_p": 1.0,
+             "max_tokens": request_func_input.output_len,
+             "stream": True,
+             "min_length": request_func_input.output_len,
+             "end_id": 1048576,
+         }
+         output = RequestFuncOutput()
+         output.prompt_len = request_func_input.prompt_len
+
+         ttft = 0.0
+         st = time.perf_counter()
+         most_recent_timestamp = st
+         try:
+             async with session.post(url=api_url, json=payload) as response:
+                 if response.status == 200:
+                     async for chunk_bytes in response.content:
+                         chunk_bytes = chunk_bytes.strip()
+                         if not chunk_bytes:
+                             continue
+
+                         chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data:")
+
+                         data = json.loads(chunk)
+                         output.generated_text += data["text_output"]
+                         timestamp = time.perf_counter()
+                         # First token
+                         if ttft == 0.0:
+                             ttft = time.perf_counter() - st
+                             output.ttft = ttft
+
+                         # Decoding phase
+                         else:
+                             output.itl.append(timestamp - most_recent_timestamp)
+
+                         most_recent_timestamp = timestamp
+
+                     output.latency = most_recent_timestamp - st
+                     output.success = True
+                     output.output_len = request_func_input.output_len
+
+                 else:
+                     output.error = response.reason or ""
+                     output.success = False
+         except Exception:
+             output.success = False
+             exc_info = sys.exc_info()
+             output.error = "".join(traceback.format_exception(*exc_info))
+
+     if pbar:
+         pbar.update(1)
+     return output
+
+
  # set ignore_eos True by default
  async def async_request_openai_completions(
      request_func_input: RequestFuncInput,
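
For context, the endpoint and payload used by async_request_trt_llm above can be exercised directly. A minimal standalone sketch (editor's illustration, not part of the package; the host, port, and prompt are placeholders, and the payload keys simply mirror the ones in the diff, where min_length/end_id force a fixed output length because ignore_eos is not supported):

    import asyncio
    import json

    import aiohttp

    async def probe_trt_llm(base_url: str = "http://127.0.0.1:8000") -> None:
        # Mirrors the payload built by async_request_trt_llm above.
        payload = {
            "accumulate_tokens": True,
            "text_input": "Hello, my name is",
            "temperature": 0.000001,
            "top_p": 1.0,
            "max_tokens": 32,
            "stream": True,
            "min_length": 32,
            "end_id": 1048576,
        }
        url = f"{base_url}/v2/models/ensemble/generate_stream"
        async with aiohttp.ClientSession() as session:
            async with session.post(url, json=payload) as resp:
                async for chunk in resp.content:
                    line = chunk.strip()
                    if line:
                        data = json.loads(line.decode("utf-8").removeprefix("data:"))
                        print(data.get("text_output", ""), end="", flush=True)

    # asyncio.run(probe_trt_llm())
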
@@ -76,7 +148,7 @@ async def async_request_openai_completions(
              "temperature": 0.0,
              "best_of": 1,
              "max_tokens": request_func_input.output_len,
-             "stream": True,
+             "stream": not args.disable_stream,
              "ignore_eos": True,
          }
          headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
@@ -99,8 +171,9 @@ async def async_request_openai_completions(
                              continue

                          chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                         latency = time.perf_counter() - st
                          if chunk == "[DONE]":
-                             latency = time.perf_counter() - st
+                             pass
                          else:
                              data = json.loads(chunk)

@@ -123,6 +196,7 @@ async def async_request_openai_completions(
                      output.generated_text = generated_text
                      output.success = True
                      output.latency = latency
+                     output.output_len = request_func_input.output_len
                  else:
                      output.error = response.reason or ""
                      output.success = False
@@ -167,6 +241,7 @@ ASYNC_REQUEST_FUNCS = {
      "sglang": async_request_openai_completions,
      "vllm": async_request_openai_completions,
      "lmdeploy": async_request_openai_completions,
+     "trt": async_request_trt_llm,
  }


@@ -175,9 +250,11 @@ class BenchmarkMetrics:
      completed: int
      total_input: int
      total_output: int
+     total_output_retokenized: int
      request_throughput: float
      input_throughput: float
      output_throughput: float
+     output_throughput_retokenized: float
      mean_ttft_ms: float
      median_ttft_ms: float
      std_ttft_ms: float
@@ -190,6 +267,8 @@ class BenchmarkMetrics:
      median_itl_ms: float
      std_itl_ms: float
      p99_itl_ms: float
+     mean_e2e_latency_ms: float
+     median_e2e_latency_ms: float


  default_sharegpt_path = "ShareGPT_V3_unfiltered_cleaned_split.json"
@@ -384,31 +463,36 @@ def calculate_metrics(
      outputs: List[RequestFuncOutput],
      dur_s: float,
      tokenizer: PreTrainedTokenizerBase,
+     backend: str,
  ) -> Tuple[BenchmarkMetrics, List[int]]:
-     actual_output_lens: List[int] = []
+     output_lens: List[int] = []
+     retokenized_output_lens: List[int] = []
      total_input = 0
      completed = 0
      itls: List[float] = []
      tpots: List[float] = []
      ttfts: List[float] = []
+     e2e_latencies: List[float] = []
      for i in range(len(outputs)):
          if outputs[i].success:
-             # We use the tokenizer to count the number of output tokens for all
-             # serving backends instead of looking at len(outputs[i].itl) since
-             # multiple output tokens may be bundled together
-             # Note : this may inflate the output token count slightly
-             output_len = len(
+             output_len = outputs[i].output_len
+             output_lens.append(output_len)
+             retokenized_output_len = len(
                  tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids
              )
-             actual_output_lens.append(output_len)
+             retokenized_output_lens.append(retokenized_output_len)
              total_input += input_requests[i][1]
              if output_len > 1:
                  tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
              itls += outputs[i].itl
              ttfts.append(outputs[i].ttft)
+
+             e2e_latencies.append(outputs[i].latency)
+
              completed += 1
          else:
-             actual_output_lens.append(0)
+             output_lens.append(0)
+             retokenized_output_lens.append(0)

      if completed == 0:
          warnings.warn(
@@ -419,10 +503,12 @@ def calculate_metrics(
      metrics = BenchmarkMetrics(
          completed=completed,
          total_input=total_input,
-         total_output=sum(actual_output_lens),
+         total_output=sum(output_lens),
+         total_output_retokenized=sum(retokenized_output_lens),
          request_throughput=completed / dur_s,
          input_throughput=total_input / dur_s,
-         output_throughput=sum(actual_output_lens) / dur_s,
+         output_throughput=sum(output_lens) / dur_s,
+         output_throughput_retokenized=sum(retokenized_output_lens) / dur_s,
          mean_ttft_ms=np.mean(ttfts or 0)
          * 1000,  # ttfts is empty if streaming is not supported by backend
          median_ttft_ms=np.median(ttfts or 0) * 1000,
@@ -436,9 +522,11 @@ def calculate_metrics(
          median_itl_ms=np.median(itls or 0) * 1000,
          std_itl_ms=np.std(itls or 0) * 1000,
          p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
+         mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000,
+         median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
      )

-     return metrics, actual_output_lens
+     return metrics, output_lens


  async def benchmark(
@@ -449,6 +537,7 @@ async def benchmark(
      input_requests: List[Tuple[str, int, int]],
      request_rate: float,
      disable_tqdm: bool,
+     enable_multi: bool,
  ):
      if backend in ASYNC_REQUEST_FUNCS:
          request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -498,19 +587,26 @@ async def benchmark(

      benchmark_duration = time.perf_counter() - benchmark_start_time

-     metrics, actual_output_lens = calculate_metrics(
+     metrics, output_lens = calculate_metrics(
          input_requests=input_requests,
          outputs=outputs,
          dur_s=benchmark_duration,
          tokenizer=tokenizer,
+         backend=backend,
      )

      print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
+     print("{:<40} {:<10}".format("Backend:", backend))
      print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
      print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
      print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
      print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
      print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+     print(
+         "{:<40} {:<10}".format(
+             "Total generated tokens (retokenized):", metrics.total_output_retokenized
+         )
+     )
      print(
          "{:<40} {:<10.2f}".format(
              "Request throughput (req/s):", metrics.request_throughput
@@ -526,6 +622,15 @@ async def benchmark(
              "Output token throughput (tok/s):", metrics.output_throughput
          )
      )
+     print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
+     print(
+         "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
+     )
+     print(
+         "{:<40} {:<10.2f}".format(
+             "Median E2E Latency (ms):", metrics.median_e2e_latency_ms
+         )
+     )
      print("{s:{c}^{n}}".format(s="Time to First Token", n=50, c="-"))
      print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
      print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
@@ -542,11 +647,53 @@ async def benchmark(
      print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
      print("=" * 50)

+     if (
+         metrics.median_ttft_ms is not None
+         and metrics.mean_itl_ms is not None
+         and metrics.output_throughput is not None
+     ):
+         result = {
+             "backend": args.backend,
+             "dataset_name": args.dataset_name,
+             "request_rate": request_rate,
+             "total_input": metrics.total_input,
+             "total_output": metrics.total_output,
+             "total_output_retokenized": metrics.total_output_retokenized,
+             "mean_e2e_latency": metrics.mean_e2e_latency_ms,
+             "median_e2e_latency": metrics.median_e2e_latency_ms,
+             "median_ttft": metrics.median_ttft_ms,
+             "median_itl": metrics.median_itl_ms,
+             "output_token_throughput": metrics.output_throughput,
+             "sharegpt_output_len": args.sharegpt_output_len,
+             "random_input_len": args.random_input_len,
+             "random_output_len": args.random_output_len,
+             "random_range_ratio": args.random_range_ratio,
+             "benchmark_duration": benchmark_duration,
+         }
+     else:
+         print(f"Error running benchmark for request rate: {request_rate}")
+         print("-" * 30)
+
+     # Determine output file name
+     if args.output_file:
+         output_file_name = args.output_file
+     else:
+         now = datetime.now().strftime("%m%d")
+         if args.dataset_name == "random":
+             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
+         else:
+             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
+
+     # Append results to a JSONL file
+     with open(output_file_name, "a") as file:
+         file.write(json.dumps(result) + "\n")
+
      result = {
          "duration": benchmark_duration,
          "completed": metrics.completed,
          "total_input_tokens": metrics.total_input,
          "total_output_tokens": metrics.total_output,
+         "total_output_tokens_retokenized": metrics.total_output_retokenized,
          "request_throughput": metrics.request_throughput,
          "input_throughput": metrics.input_throughput,
          "output_throughput": metrics.output_throughput,
@@ -563,15 +710,34 @@ async def benchmark(
          "std_itl_ms": metrics.std_itl_ms,
          "p99_itl_ms": metrics.p99_itl_ms,
          "input_lens": [output.prompt_len for output in outputs],
-         "output_lens": actual_output_lens,
+         "output_lens": output_lens,
          "ttfts": [output.ttft for output in outputs],
          "itls": [output.itl for output in outputs],
          "generated_texts": [output.generated_text for output in outputs],
          "errors": [output.error for output in outputs],
+         "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+         "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
      }
      return result


+ def parse_request_rate_range(request_rate_range):
+     if len(request_rate_range.split(",")) == 3:
+         start, stop, step = map(int, request_rate_range.split(","))
+         return list(range(start, stop, step))
+     else:
+         return list(map(int, request_rate_range.split(",")))
+
+
+ def check_chat_template(model_path):
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+         return "chat_template" in tokenizer.init_kwargs
+     except Exception as e:
+         print(f"Fail to load tokenizer config with error={e}")
+         return False
+
+
  def fire(args: argparse.Namespace):
      random.seed(args.seed)
      np.random.seed(args.seed)
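
For reference, the new parse_request_rate_range helper treats exactly three comma-separated values as start,stop,step and any other count as an explicit list of rates, which is why the --request-rate-range help text warns against passing a plain list of exactly three rates. A standalone illustration (editor's sketch, duplicating the helper so it runs on its own):

    def parse_request_rate_range(request_rate_range):
        # Three values -> range(start, stop, step); any other count -> explicit list.
        if len(request_rate_range.split(",")) == 3:
            start, stop, step = map(int, request_rate_range.split(","))
            return list(range(start, stop, step))
        else:
            return list(map(int, request_rate_range.split(",")))

    print(parse_request_rate_range("2,34,2"))         # default sweep: [2, 4, 6, ..., 32]
    print(parse_request_rate_range("1,2,4,8,16,32"))  # explicit list of request rates
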
@@ -581,6 +747,7 @@ def fire(args: argparse.Namespace):
          "sglang": 30000,
          "lmdeploy": 23333,
          "vllm": 8000,
+         "trt": 8000,
      }.get(args.backend, 30000)

      api_url = (
@@ -594,6 +761,16 @@ def fire(args: argparse.Namespace):
          else f"http://{args.host}:{args.port}/v1/models"
      )

+     if args.backend == "trt":
+         api_url = (
+             f"{args.base_url}/v2/models/ensemble/generate_stream"
+             if args.base_url
+             else f"http://{args.host}:{args.port}/v2/models/ensemble/generate_stream"
+         )
+         if args.model is None:
+             print("Please provide a model using `--model` when using `trt` backend.")
+             sys.exit(1)
+
      if args.model is None:
          try:
              response = requests.get(model_url)
@@ -610,6 +787,12 @@ def fire(args: argparse.Namespace):
          print("No model specified or found. Please provide a model using `--model`.")
          sys.exit(1)

+     if not check_chat_template(args.model):
+         print(
+             "\nWARNING It is recommended to use the `Chat` or `Instruct` model for benchmarking.\n"
+             "Because when the tokenizer counts the output tokens, if there is gibberish, it might count incorrectly.\n"
+         )
+
      print(f"{args}\n")

      backend = args.backend
@@ -637,17 +820,35 @@ def fire(args: argparse.Namespace):
      else:
          raise ValueError(f"Unknown dataset: {args.dataset_name}")

-     asyncio.run(
-         benchmark(
-             backend=backend,
-             api_url=api_url,
-             model_id=model_id,
-             tokenizer=tokenizer,
-             input_requests=input_requests,
-             request_rate=args.request_rate,
-             disable_tqdm=args.disable_tqdm,
+     if args.multi:
+         request_rates = parse_request_rate_range(args.request_rate_range)
+
+         for rate in request_rates:
+             asyncio.run(
+                 benchmark(
+                     backend=backend,
+                     api_url=api_url,
+                     model_id=model_id,
+                     tokenizer=tokenizer,
+                     input_requests=input_requests,
+                     request_rate=rate,
+                     disable_tqdm=args.disable_tqdm,
+                     enable_multi=args.multi,
+                 )
+             )
+     else:
+         asyncio.run(
+             benchmark(
+                 backend=backend,
+                 api_url=api_url,
+                 model_id=model_id,
+                 tokenizer=tokenizer,
+                 input_requests=input_requests,
+                 request_rate=args.request_rate,
+                 disable_tqdm=args.disable_tqdm,
+                 enable_multi=args.multi,
+             )
          )
-     )


  # to avoid relying on SGLang's components
@@ -751,6 +952,23 @@ if __name__ == "__main__":
          action="store_true",
          help="Specify to disable tqdm progress bar.",
      )
+     parser.add_argument(
+         "--multi",
+         action="store_true",
+         help="Use request rate range rather than single value.",
+     )
+     parser.add_argument(
+         "--request-rate-range",
+         type=str,
+         default="2,34,2",
+         help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
+     )
+     parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+     parser.add_argument(
+         "--disable-stream",
+         action="store_true",
+         help="Disable streaming mode.",
+     )

      set_ulimit()

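Each benchmark run appends one JSON object per request rate to the JSONL file named by the logic above, so a sweep started with --multi can be summarized afterwards with a few lines of Python (editor's sketch; the file name below is a hypothetical instance of the generated pattern, and the keys come from the result dict written in this diff):

    import json

    # pattern: <backend>_<MMDD>_<num_prompts>_<random_input_len>_<random_output_len>.jsonl
    with open("sglang_0801_3000_1024_1024.jsonl") as f:
        for line in f:
            record = json.loads(line)
            print(
                record["request_rate"],
                record["median_ttft"],
                record["median_itl"],
                record["output_token_throughput"],
            )
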
sglang/global_config.py CHANGED
@@ -16,9 +16,9 @@ class GlobalConfig:
          self.wait_for_new_request_delay = 0.0006

          # Runtime constants: New generation token ratio estimation
-         self.base_new_token_ratio = 0.4
+         self.init_new_token_ratio = 0.7
          self.base_min_new_token_ratio = 0.2
-         self.new_token_ratio_decay = 0.0001
+         self.new_token_ratio_decay = 0.001
          self.new_token_ratio_recovery = 0.05

          # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
@@ -27,6 +27,7 @@ class GlobalConfig:

          # Runtime constants: others
          self.num_continue_decode_steps = 10
+         self.retract_decode_steps = 20
          self.flashinfer_workspace_size = 192 * 1024 * 1024

          # Output tokenization configs
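
The retuned ratio constants are easier to read with a rough back-of-the-envelope check. The sketch below is an editor's illustration only: it assumes the scheduler decays the ratio additively by new_token_ratio_decay per step until it reaches base_min_new_token_ratio, which this diff does not itself show.

    init_new_token_ratio = 0.7      # was base_new_token_ratio = 0.4
    base_min_new_token_ratio = 0.2
    new_token_ratio_decay = 0.001   # was 0.0001

    # Under the additive-decay assumption, the estimate relaxes from the more
    # conservative starting value to the floor in about 500 steps instead of ~2000:
    steps_to_floor = (init_new_token_ratio - base_min_new_token_ratio) / new_token_ratio_decay
    print(steps_to_floor)  # 500.0
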
@@ -288,6 +288,7 @@ class StreamExecutor:
              exes[i].text_ = str(self.text_)
              exes[i].messages_ = list(self.messages_)
              exes[i].cur_role = self.cur_role
+             exes[i].cur_role_begin_pos = self.cur_role_begin_pos
              exes[i].fork_start_text_pos = len(self.text_)
              exes[i].images_ = list(self.images_)

@@ -4,19 +4,26 @@ import functools
  import json
  import os
  import warnings
- from typing import AbstractSet, Collection, Literal, Optional, Union
+ from typing import AbstractSet, Collection, Dict, Literal, Optional, Type, Union

  from huggingface_hub import snapshot_download
  from transformers import (
      AutoConfig,
      AutoProcessor,
      AutoTokenizer,
+     PretrainedConfig,
      PreTrainedTokenizer,
      PreTrainedTokenizerFast,
  )
+ from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig

  from sglang.srt.utils import is_multimodal_model

+ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
+     ChatGLMConfig.model_type: ChatGLMConfig,
+     DbrxConfig.model_type: DbrxConfig,
+ }
+

  def download_from_hf(model_path: str):
      if os.path.exists(model_path):
@@ -40,6 +47,9 @@ def get_config(
      config = AutoConfig.from_pretrained(
          model, trust_remote_code=trust_remote_code, revision=revision
      )
+     if config.model_type in _CONFIG_REGISTRY:
+         config_class = _CONFIG_REGISTRY[config.model_type]
+         config = config_class.from_pretrained(model, revision=revision)
      if model_overide_args:
          config.update(model_overide_args)
      return config
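
The effect of the registry added above: for model types with a registered class, get_config reloads the config with that class instead of keeping the generic (possibly remote-code) AutoConfig result. A hedged usage sketch (editor's illustration; it assumes the function lives in sglang.srt.hf_transformers_utils, that transformers and vllm are installed, and the checkpoint name is just an example):

    from sglang.srt.hf_transformers_utils import get_config  # assumed module path

    config = get_config("THUDM/chatglm3-6b", trust_remote_code=True)
    # With the registry in place, a ChatGLM checkpoint should resolve to
    # vllm.transformers_utils.configs.ChatGLMConfig rather than the remote-code class.
    print(type(config))
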
@@ -63,6 +73,8 @@ def get_context_length(config):
      rope_scaling = getattr(config, "rope_scaling", None)
      if rope_scaling:
          rope_scaling_factor = config.rope_scaling["factor"]
+         if config.rope_scaling["rope_type"] == "llama3":
+             rope_scaling_factor = 1
      else:
          rope_scaling_factor = 1

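The new branch matters for Llama-3.1-style checkpoints, whose rope_scaling block carries rope_type "llama3" and a factor that is already reflected in max_position_embeddings; multiplying by the factor again would overstate the usable context length. A simplified standalone illustration (editor's sketch; the real get_context_length inspects more config fields than this):

    def effective_context_len(max_position_embeddings, rope_scaling=None):
        if rope_scaling:
            factor = rope_scaling["factor"]
            if rope_scaling.get("rope_type") == "llama3":
                factor = 1  # llama3-style scaling: do not multiply again
            return int(factor * max_position_embeddings)
        return max_position_embeddings

    print(effective_context_len(131072, {"rope_type": "llama3", "factor": 8.0}))  # 131072
    print(effective_context_len(4096, {"rope_type": "linear", "factor": 4.0}))    # 16384
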
@@ -34,12 +34,11 @@ class LogitProcessorOutput:
  @dataclasses.dataclass
  class LogitsMetadata:
      forward_mode: ForwardMode
-     extend_seq_lens: torch.Tensor
-     extend_start_loc: torch.Tensor
-
-     # For logprobs
      return_logprob: bool
-     top_logprobs_nums: List[int]
+
+     extend_seq_lens: torch.Tensor = None
+     extend_start_loc: torch.Tensor = None
+     top_logprobs_nums: List[int] = None

      @classmethod
      def from_input_metadata(cls, input_metadata: InputMetadata):
@@ -85,32 +85,47 @@ class RadixAttention(nn.Module):
          return o

      def extend_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata):
-         o1, s1 = input_metadata.flashinfer_prefill_wrapper_ragged.forward_return_lse(
-             q.contiguous().view(-1, self.tp_q_head_num, self.head_dim),
-             k.contiguous().view(-1, self.tp_k_head_num, self.head_dim),
-             v.contiguous().view(-1, self.tp_v_head_num, self.head_dim),
-             causal=True,
-             sm_scale=self.scaling,
-             logits_soft_cap=self.logit_cap,
-         )
+         if not input_metadata.use_ragged:
+             self.store_kv_cache(k, v, input_metadata)

-         if input_metadata.extend_no_prefix:
-             o = o1
-         else:
-             o2, s2 = input_metadata.flashinfer_prefill_wrapper_paged.forward_return_lse(
+             o = input_metadata.flashinfer_prefill_wrapper_paged.forward(
                  q.contiguous().view(-1, self.tp_q_head_num, self.head_dim),
-                 input_metadata.token_to_kv_pool.kv_data[self.layer_id],
-                 causal=False,
+                 input_metadata.token_to_kv_pool.get_kv_buffer(self.layer_id),
+                 causal=True,
                  sm_scale=self.scaling,
                  logits_soft_cap=self.logit_cap,
              )
+         else:
+             o1, s1 = (
+                 input_metadata.flashinfer_prefill_wrapper_ragged.forward_return_lse(
+                     q.contiguous().view(-1, self.tp_q_head_num, self.head_dim),
+                     k.contiguous().view(-1, self.tp_k_head_num, self.head_dim),
+                     v.contiguous().view(-1, self.tp_v_head_num, self.head_dim),
+                     causal=True,
+                     sm_scale=self.scaling,
+                     logits_soft_cap=self.logit_cap,
+                 )
+             )

-             o, _ = merge_state(o1, s1, o2, s2)
+             if input_metadata.extend_no_prefix:
+                 o = o1
+             else:
+                 o2, s2 = (
+                     input_metadata.flashinfer_prefill_wrapper_paged.forward_return_lse(
+                         q.contiguous().view(-1, self.tp_q_head_num, self.head_dim),
+                         input_metadata.token_to_kv_pool.get_kv_buffer(self.layer_id),
+                         causal=False,
+                         sm_scale=self.scaling,
+                         logits_soft_cap=self.logit_cap,
+                     )
+                 )

-             self.store_kv_cache(k, v, input_metadata)
+                 o, _ = merge_state(o1, s1, o2, s2)
+
+             self.store_kv_cache(k, v, input_metadata)

-         if input_metadata.total_num_tokens >= global_config.layer_sync_threshold:
-             torch.cuda.synchronize()
+             if input_metadata.total_num_tokens >= global_config.layer_sync_threshold:
+                 torch.cuda.synchronize()

          return o.view(-1, self.tp_q_head_num * self.head_dim)

@@ -119,7 +134,7 @@ class RadixAttention(nn.Module):

          o = input_metadata.flashinfer_decode_wrapper.forward(
              q.contiguous().view(-1, self.tp_q_head_num, self.head_dim),
-             input_metadata.token_to_kv_pool.kv_data[self.layer_id],
+             input_metadata.token_to_kv_pool.get_kv_buffer(self.layer_id),
              sm_scale=self.scaling,
              logits_soft_cap=self.logit_cap,
          )
@@ -136,33 +151,7 @@ class RadixAttention(nn.Module):
          return self.decode_forward(q, k, v, input_metadata)

      def store_kv_cache(self, cache_k, cache_v, input_metadata: InputMetadata):
-         kv_cache = input_metadata.token_to_kv_pool.kv_data[self.layer_id]
-         _store_kv_cache(cache_k, cache_v, kv_cache, input_metadata.out_cache_loc)
-
-
- try:
-
-     @torch.library.custom_op("mylib::store_kv_cache", mutates_args={"kv_cache"})
-     def _store_kv_cache(
-         k: torch.Tensor,
-         v: torch.Tensor,
-         kv_cache: torch.Tensor,
-         cache_loc: torch.Tensor,
-     ) -> None:
-         kv_cache[cache_loc, 0] = k
-         kv_cache[cache_loc, 1] = v
-
-     @_store_kv_cache.register_fake
-     def _(k, v, kv_cache, cache_loc):
-         pass
-
- except:
-
-     def _store_kv_cache(
-         k: torch.Tensor,
-         v: torch.Tensor,
-         kv_cache: torch.Tensor,
-         cache_loc: torch.Tensor,
-     ) -> None:
-         kv_cache[cache_loc, 0] = k
-         kv_cache[cache_loc, 1] = v
+         k_cache = input_metadata.token_to_kv_pool.get_key_buffer(self.layer_id)
+         v_cache = input_metadata.token_to_kv_pool.get_value_buffer(self.layer_id)
+         k_cache[input_metadata.out_cache_loc] = cache_k
+         v_cache[input_metadata.out_cache_loc] = cache_v
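
The rewritten store_kv_cache writes through separate per-layer key and value buffers (get_key_buffer / get_value_buffer) instead of indexing a combined kv_data tensor, which also removes the need for the old torch.library.custom_op workaround. A standalone sketch of the advanced-indexing pattern it relies on (editor's illustration; the shapes are made up):

    import torch

    num_slots, num_heads, head_dim = 16, 2, 4
    k_cache = torch.zeros(num_slots, num_heads, head_dim)
    v_cache = torch.zeros(num_slots, num_heads, head_dim)

    # out_cache_loc plays the role of input_metadata.out_cache_loc: one
    # pre-allocated cache slot index per new token in the batch.
    out_cache_loc = torch.tensor([3, 7, 8])
    cache_k = torch.randn(3, num_heads, head_dim)
    cache_v = torch.randn(3, num_heads, head_dim)

    # Same pattern as `k_cache[input_metadata.out_cache_loc] = cache_k` above:
    # advanced indexing scatters each token's K/V into its assigned slot in place.
    k_cache[out_cache_loc] = cache_k
    v_cache[out_cache_loc] = cache_v
    assert torch.equal(k_cache[7], cache_k[1])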