sglang 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. sglang/__init__.py +1 -1
  2. sglang/api.py +14 -0
  3. sglang/backend/anthropic.py +18 -12
  4. sglang/backend/base_backend.py +6 -0
  5. sglang/backend/openai.py +41 -12
  6. sglang/backend/runtime_endpoint.py +57 -6
  7. sglang/lang/chat_template.py +47 -26
  8. sglang/lang/interpreter.py +15 -2
  9. sglang/lang/ir.py +1 -1
  10. sglang/srt/constrained/__init__.py +23 -1
  11. sglang/srt/constrained/fsm_cache.py +14 -3
  12. sglang/srt/layers/context_flashattention_nopad.py +1 -1
  13. sglang/srt/layers/extend_attention.py +7 -6
  14. sglang/srt/layers/radix_attention.py +2 -10
  15. sglang/srt/layers/token_attention.py +12 -4
  16. sglang/srt/managers/io_struct.py +3 -1
  17. sglang/srt/managers/router/infer_batch.py +6 -2
  18. sglang/srt/managers/router/model_rpc.py +45 -32
  19. sglang/srt/managers/router/model_runner.py +40 -25
  20. sglang/srt/managers/tokenizer_manager.py +2 -0
  21. sglang/srt/model_config.py +12 -5
  22. sglang/srt/models/gemma.py +340 -0
  23. sglang/srt/models/llama2.py +5 -5
  24. sglang/srt/models/llava.py +2 -4
  25. sglang/srt/models/mixtral.py +5 -5
  26. sglang/srt/models/qwen.py +4 -4
  27. sglang/srt/models/qwen2.py +5 -5
  28. sglang/srt/models/stablelm.py +293 -0
  29. sglang/srt/server.py +111 -47
  30. sglang/srt/server_args.py +44 -9
  31. sglang/srt/utils.py +1 -0
  32. sglang/test/test_utils.py +1 -1
  33. sglang/utils.py +15 -12
  34. {sglang-0.1.12.dist-info → sglang-0.1.14.dist-info}/METADATA +16 -6
  35. sglang-0.1.14.dist-info/RECORD +64 -0
  36. {sglang-0.1.12.dist-info → sglang-0.1.14.dist-info}/WHEEL +1 -1
  37. sglang/srt/models/gpt_neox.py +0 -274
  38. sglang-0.1.12.dist-info/RECORD +0 -63
  39. {sglang-0.1.12.dist-info → sglang-0.1.14.dist-info}/LICENSE +0 -0
  40. {sglang-0.1.12.dist-info → sglang-0.1.14.dist-info}/top_level.txt +0 -0
@@ -4,8 +4,16 @@
 import torch
 import triton
 import triton.language as tl
+from sglang.srt.managers.router.model_runner import global_server_args_dict
 from sglang.srt.utils import wrap_kernel_launcher
 
+if global_server_args_dict.get("attention_reduce_in_fp32", False):
+    REDUCE_TRITON_TYPE = tl.float32
+    REDUCE_TORCH_TYPE = torch.float32
+else:
+    REDUCE_TRITON_TYPE = tl.float16
+    REDUCE_TORCH_TYPE = torch.float16
+
 
 @triton.jit
 def _fwd_kernel_stage1(
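Note: the hunk above reads a module-level flag once at import time and derives both the Triton and Torch reduction dtypes from it. A minimal standalone sketch of that selection (the dict contents here are illustrative, not sglang's defaults):

    import torch

    # stand-in for sglang's global_server_args_dict, populated from server args
    global_server_args_dict = {"attention_reduce_in_fp32": True}

    if global_server_args_dict.get("attention_reduce_in_fp32", False):
        REDUCE_TORCH_TYPE = torch.float32
    else:
        REDUCE_TORCH_TYPE = torch.float16

    # intermediate attention scores are then allocated in the chosen dtype
    att_m = torch.empty((8, 1024), dtype=REDUCE_TORCH_TYPE)
    print(att_m.dtype)  # torch.float32 when the flag is set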
@@ -49,7 +57,7 @@ def _fwd_kernel_stage1(
     block_mask = tl.where(block_stard_index < cur_batch_seq_len, 1, 0)
 
     for start_mark in range(0, block_mask, 1):
-        q = tl.load(Q + off_q + start_mark)
+        q = tl.load(Q + off_q + start_mark).to(REDUCE_TRITON_TYPE)
         offs_n_new = cur_batch_start_index + offs_n
         k_loc = tl.load(
             Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + offs_n_new,
@@ -65,7 +73,7 @@ def _fwd_kernel_stage1(
             K_Buffer + offs_buf_k,
             mask=offs_n_new[:, None] < cur_batch_end_index,
             other=0.0,
-        )
+        ).to(REDUCE_TRITON_TYPE)
         att_value = tl.sum(q[None, :] * k, 1)
         att_value *= sm_scale
         off_o = cur_head * att_stride_h + (cur_batch_in_all_start_index + offs_n)
@@ -161,7 +169,7 @@ def _token_att_m_fwd(
     # shape constraints
     Lq, Lk = q.shape[-1], k_buffer.shape[-1]
     assert Lq == Lk
-    assert Lk in {16, 32, 64, 128}
+    assert Lk in {16, 32, 64, 128, 256}
     sm_scale = 1.0 / (Lk**0.5)
 
     batch, head_num = B_req_idx.shape[0], q.shape[1]
@@ -299,7 +307,7 @@ def token_attention_fwd(
 ):
     if att_m is None:
         att_m = torch.empty(
-            (q.shape[-2], total_num_tokens), dtype=q.dtype, device="cuda"
+            (q.shape[-2], total_num_tokens), dtype=REDUCE_TORCH_TYPE, device="cuda"
         )
 
     _token_att_m_fwd(
@@ -15,10 +15,12 @@ class GenerateReqInput:
     sampling_params: Union[List[Dict], Dict] = None
     # The request id
     rid: Optional[Union[List[str], str]] = None
-    # Whether return logprobs of the prompts
+    # Whether to return logprobs
     return_logprob: Optional[Union[List[bool], bool]] = None
     # The start location of the prompt for return_logprob
     logprob_start_len: Optional[Union[List[int], int]] = None
+    # Whether to detokenize tokens in logprobs
+    return_text_in_logprobs: bool = False
     # Whether to stream output
     stream: bool = False
 
@@ -27,8 +27,12 @@ class Req:
         self.input_ids = input_ids
         self.output_ids = []
 
-        # for accumulated prompt tokens from jump forward
-        self.orig_prompt_tokens = len(input_ids)
+        # Since jump forward may retokenize the prompt with partial outputs,
+        # we maintain the original prompt length to report the correct usage.
+        self.prompt_tokens = len(input_ids)
+        # The number of decoded tokens for token usage report. Note that
+        # this does not include the jump forward tokens.
+        self.completion_tokens_wo_jump_forward = 0
 
         # For vision input
         self.pixel_values = None
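Note: together with the handle_finished_requests change further down, these fields drive the usage report. A toy sketch of the resulting arithmetic (all numbers illustrative):

    prompt_tokens = 10                     # len(input_ids) at request creation
    input_ids_len = 14                     # grew by 4 via jump-forward retokenization
    output_ids_len = 6                     # tokens currently in output_ids
    completion_tokens_wo_jump_forward = 9  # incremented once per real decode step

    meta_info = {
        "prompt_tokens": prompt_tokens,
        # counts jump-forward tokens as completions: (14 + 6) - 10 = 10
        "completion_tokens": input_ids_len + output_ids_len - prompt_tokens,
        # counts only tokens actually produced by decoding
        "completion_tokens_wo_jump_forward": completion_tokens_wo_jump_forward,
    }
    print(meta_info)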
@@ -46,7 +46,6 @@ class ModelRpcServer(rpyc.Service):
         server_args, port_args = [obtain(x) for x in [server_args, port_args]]
 
         # Copy arguments
-        self.model_mode = server_args.model_mode
         self.tp_rank = tp_rank
         self.tp_size = server_args.tp_size
         self.schedule_heuristic = server_args.schedule_heuristic
@@ -57,17 +56,26 @@ class ModelRpcServer(rpyc.Service):
 
         # Init model and tokenizer
         self.model_config = ModelConfig(
-            server_args.model_path, server_args.trust_remote_code
+            server_args.model_path,
+            server_args.trust_remote_code,
+            context_length=server_args.context_length,
         )
+
+        # for model end global settings
+        server_args_dict = {
+            "enable_flashinfer": server_args.enable_flashinfer,
+            "attention_reduce_in_fp32": server_args.attention_reduce_in_fp32,
+        }
+
         self.model_runner = ModelRunner(
-            self.model_config,
-            server_args.mem_fraction_static,
-            tp_rank,
-            server_args.tp_size,
-            port_args.nccl_port,
-            server_args.load_format,
-            server_args.trust_remote_code,
-            server_args.model_mode,
+            model_config=self.model_config,
+            mem_fraction_static=server_args.mem_fraction_static,
+            tp_rank=tp_rank,
+            tp_size=server_args.tp_size,
+            nccl_port=port_args.nccl_port,
+            load_format=server_args.load_format,
+            trust_remote_code=server_args.trust_remote_code,
+            server_args_dict=server_args_dict,
         )
         if is_multimodal_model(server_args.model_path):
             self.processor = get_processor(
@@ -102,11 +110,11 @@ class ModelRpcServer(rpyc.Service):
             f"max_total_num_token={self.max_total_num_token}, "
             f"max_prefill_num_token={self.max_prefill_num_token}, "
             f"context_len={self.model_config.context_len}, "
-            f"model_mode={self.model_mode}"
         )
+        logger.info(server_args.get_optional_modes_logging())
 
         # Init cache
-        self.tree_cache = RadixCache(disable="no-cache" in self.model_mode)
+        self.tree_cache = RadixCache(server_args.disable_radix_cache)
         self.tree_cache_metrics = {"total": 0, "hit": 0}
         self.scheduler = Scheduler(
             self.schedule_heuristic,
@@ -208,6 +216,19 @@ class ModelRpcServer(rpyc.Service):
 
                     if self.out_pyobjs and self.running_batch.reqs[0].stream:
                         break
+
+                    if self.running_batch is not None and self.tp_rank == 0:
+                        if self.decode_forward_ct % 40 == 0:
+                            num_used = self.max_total_num_token - (
+                                self.token_to_kv_pool.available_size()
+                                + self.tree_cache.evictable_size()
+                            )
+                            logger.info(
+                                f"#running-req: {len(self.running_batch.reqs)}, "
+                                f"#token: {num_used}, "
+                                f"token usage: {num_used / self.max_total_num_token:.2f}, "
+                                f"#queue-req: {len(self.forward_queue)}"
+                            )
             else:
                 # check the available size
                 available_size = (
@@ -221,19 +242,6 @@ class ModelRpcServer(rpyc.Service):
                         "KV cache pool leak detected!"
                     )
 
-        if self.running_batch is not None and self.tp_rank == 0:
-            if self.decode_forward_ct % 20 == 0:
-                num_used = self.max_total_num_token - (
-                    self.token_to_kv_pool.available_size()
-                    + self.tree_cache.evictable_size()
-                )
-                logger.info(
-                    f"#running-req: {len(self.running_batch.reqs)}, "
-                    f"#token: {num_used}, "
-                    f"token usage: {num_used / self.max_total_num_token:.2f}, "
-                    f"#queue-req: {len(self.forward_queue)}"
-                )
-
     def handle_generate_request(
         self,
         recv_req: TokenizedGenerateReqInput,
@@ -424,6 +432,7 @@ class ModelRpcServer(rpyc.Service):
         # Check finish condition
         pt = 0
         for i, req in enumerate(reqs):
+            req.completion_tokens_wo_jump_forward += 1
             req.output_ids = [next_token_ids[i]]
             req.check_finished()
 
@@ -431,9 +440,14 @@ class ModelRpcServer(rpyc.Service):
                 req.logprob = logprobs[pt : pt + req.extend_input_len - 1]
                 req.normalized_logprob = normalized_logprobs[i]
 
-                token_ids = req.input_ids + [next_token_ids[i]]
-                token_logprobs = [None] + req.logprob + [last_logprobs[i]]
+                # If logprob_start_len > 0, then first logprob_start_len prompt tokens
+                # will be ignored.
+                prompt_token_len = len(req.logprob)
+                token_ids = req.input_ids[-prompt_token_len:] + [next_token_ids[i]]
+                token_logprobs = req.logprob + [last_logprobs[i]]
                 req.token_logprob = list(zip(token_ids, token_logprobs))
+                if req.logprob_start_len == 0:
+                    req.token_logprob = [(req.input_ids[0], None)] + req.token_logprob
             pt += req.extend_input_len
 
         self.handle_finished_requests(batch)
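Note: a toy walk-through of the new token_logprob assembly above, with made-up values. req.logprob only covers the last len(req.logprob) prompt tokens once logprob_start_len > 0, so the token ids are sliced to match, and the leading (first_token, None) pair is only prepended when logprobs start at position 0:

    input_ids = [101, 102, 103, 104]
    logprob = [-0.5, -0.7]              # logprobs for the last two prompt tokens only
    next_token_id, last_logprob = 200, -0.1
    logprob_start_len = 2

    prompt_token_len = len(logprob)
    token_ids = input_ids[-prompt_token_len:] + [next_token_id]
    token_logprobs = logprob + [last_logprob]
    token_logprob = list(zip(token_ids, token_logprobs))
    if logprob_start_len == 0:
        token_logprob = [(input_ids[0], None)] + token_logprob
    print(token_logprob)  # [(103, -0.5), (104, -0.7), (200, -0.1)]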
@@ -500,6 +514,7 @@ class ModelRpcServer(rpyc.Service):
 
         # Check finish condition
         for i, (req, next_tok_id) in enumerate(zip(reqs, next_token_ids)):
+            req.completion_tokens_wo_jump_forward += 1
             req.output_ids.append(next_tok_id)
             req.check_finished()
 
@@ -541,15 +556,13 @@ class ModelRpcServer(rpyc.Service):
                     req.sampling_params.skip_special_tokens
                 )
 
-                # For the length of input_ids, which will be accumulated during jump-forward.
-                # Use the original length of input_ids to calculate the token usage info.
                 meta_info = {
-                    "prompt_tokens": req.orig_prompt_tokens,
+                    "prompt_tokens": req.prompt_tokens,
                     "completion_tokens": len(req.input_ids)
                     + len(req.output_ids)
-                    - req.orig_prompt_tokens,
+                    - req.prompt_tokens,
+                    "completion_tokens_wo_jump_forward": req.completion_tokens_wo_jump_forward,
                 }
-
                 if req.return_logprob:
                     meta_info["prompt_logprob"] = req.logprob
                     meta_info["token_logprob"] = req.token_logprob
@@ -1,9 +1,10 @@
 import importlib
 import logging
+import inspect
 from dataclasses import dataclass
 from functools import lru_cache
 from pathlib import Path
-from typing import List
+import importlib.resources
 
 import numpy as np
 import torch
@@ -13,27 +14,34 @@ from sglang.srt.utils import is_multimodal_model
 from sglang.utils import get_available_gpu_memory
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.gptq import GPTQConfig
+from vllm.model_executor.layers.quantization.marlin import MarlinConfig
 from vllm.model_executor.model_loader import _set_default_torch_dtype
 from vllm.model_executor.parallel_utils.parallel_state import initialize_model_parallel
 
+import importlib
+import pkgutil
+
 import sglang
 
-QUANTIONCONFIG_MAPPING = {"awq": AWQConfig, "gptq": GPTQConfig}
+QUANTIONCONFIG_MAPPING = {"awq": AWQConfig, "gptq": GPTQConfig, "marlin": MarlinConfig}
 
 logger = logging.getLogger("model_runner")
 
 
-# for model_mode
-global_model_mode: List[str] = []
+# for server args in model endpoints
+global_server_args_dict: dict = None
 
 
 @lru_cache()
 def import_model_classes():
     model_arch_name_to_cls = {}
-    for module_path in (Path(sglang.__file__).parent / "srt" / "models").glob("*.py"):
-        module = importlib.import_module(f"sglang.srt.models.{module_path.stem}")
-        if hasattr(module, "EntryClass"):
-            model_arch_name_to_cls[module.EntryClass.__name__] = module.EntryClass
+    package_name = "sglang.srt.models"
+    package = importlib.import_module(package_name)
+    for finder, name, ispkg in pkgutil.iter_modules(package.__path__, package_name + '.'):
+        if not ispkg:
+            module = importlib.import_module(name)
+            if hasattr(module, "EntryClass"):
+                model_arch_name_to_cls[module.EntryClass.__name__] = module.EntryClass
     return model_arch_name_to_cls
 
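Note: the rewritten import_model_classes switches from globbing loose .py files to pkgutil, which also works when the package is installed without source files on disk. A generic sketch of the same discovery pattern (the package name below is just the one from the diff):

    import importlib
    import pkgutil

    def collect_entry_classes(package_name):
        """Map EntryClass.__name__ -> EntryClass for every module in a package."""
        registry = {}
        package = importlib.import_module(package_name)
        for _finder, name, ispkg in pkgutil.iter_modules(package.__path__, package_name + "."):
            if ispkg:
                continue
            module = importlib.import_module(name)
            entry = getattr(module, "EntryClass", None)
            if entry is not None:
                registry[entry.__name__] = entry
        return registry

    # e.g. collect_entry_classes("sglang.srt.models")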
 
@@ -81,7 +89,6 @@ class InputMetadata:
     return_logprob: bool = False
 
     # for flashinfer
-    use_flashinfer: bool = False
     qo_indptr: torch.Tensor = None
     kv_indptr: torch.Tensor = None
     kv_indices: torch.Tensor = None
@@ -126,14 +133,21 @@ class InputMetadata:
             self.prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper(
                 workspace_buffer, "NHD"
             )
-            self.prefill_wrapper.begin_forward(
+            args = [
                 self.qo_indptr,
                 self.kv_indptr,
                 self.kv_indices,
                 self.kv_last_page_len,
                 self.model_runner.model_config.num_attention_heads // tp_size,
                 self.model_runner.model_config.num_key_value_heads // tp_size,
-            )
+            ]
+
+            # flashinfer >= 0.0.3
+            # FIXME: Drop this when flashinfer updates to 0.0.4
+            if len(inspect.signature(self.prefill_wrapper.begin_forward).parameters) == 7:
+                args.append(self.model_runner.model_config.head_dim)
+
+            self.prefill_wrapper.begin_forward(*args)
         else:
             self.decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
                 workspace_buffer, "NHD"
@@ -224,8 +238,7 @@ class InputMetadata:
         if forward_mode == ForwardMode.EXTEND:
             ret.init_extend_args()
 
-        ret.use_flashinfer = "flashinfer" in model_runner.model_mode
-        if ret.use_flashinfer:
+        if global_server_args_dict.get("enable_flashinfer", False):
             ret.init_flashinfer_args(tp_size)
 
         return ret
@@ -241,7 +254,7 @@ class ModelRunner:
         nccl_port,
         load_format="auto",
         trust_remote_code=True,
-        model_mode: List[str] = (),
+        server_args_dict: dict = {},
     ):
         self.model_config = model_config
         self.mem_fraction_static = mem_fraction_static
@@ -250,10 +263,9 @@ class ModelRunner:
         self.nccl_port = nccl_port
         self.load_format = load_format
         self.trust_remote_code = trust_remote_code
-        self.model_mode = model_mode
 
-        global global_model_mode
-        global_model_mode = model_mode
+        global global_server_args_dict
+        global_server_args_dict = server_args_dict
 
         # Init torch distributed
         torch.cuda.set_device(self.tp_rank)
@@ -292,9 +304,15 @@ class ModelRunner:
             self.model_config.hf_config, "quantization_config", None
         )
         if hf_quant_config is not None:
-            quant_config_class = QUANTIONCONFIG_MAPPING.get(
-                hf_quant_config["quant_method"]
-            )
+            hf_quant_method = hf_quant_config["quant_method"]
+
+            # compat: autogptq uses is_marlin_format within quant config
+            if (hf_quant_method == "gptq"
+                and "is_marlin_format" in hf_quant_config
+                and hf_quant_config["is_marlin_format"]):
+                hf_quant_method = "marlin"
+            quant_config_class = QUANTIONCONFIG_MAPPING.get(hf_quant_method)
+
             if quant_config_class is None:
                 raise ValueError(
                     f"Unsupported quantization method: {hf_quant_config['quant_method']}"
@@ -319,9 +337,7 @@ class ModelRunner:
         available_gpu_memory = get_available_gpu_memory(
             self.tp_rank, distributed=self.tp_size > 1
         ) * (1 << 30)
-        head_dim = (
-            self.model_config.hidden_size // self.model_config.num_attention_heads
-        )
+        head_dim = self.model_config.head_dim
         head_num = self.model_config.num_key_value_heads // self.tp_size
         cell_size = head_num * head_dim * self.model_config.num_hidden_layers * 2 * 2
         rest_memory = available_gpu_memory - total_gpu_memory * (
@@ -346,8 +362,7 @@ class ModelRunner:
             self.max_total_num_token,
             dtype=torch.float16,
             head_num=self.model_config.num_key_value_heads // self.tp_size,
-            head_dim=self.model_config.hidden_size
-            // self.model_config.num_attention_heads,
+            head_dim=self.model_config.head_dim,
             layer_num=self.model_config.num_hidden_layers,
         )
 
@@ -82,6 +82,8 @@ class TokenizerManager:
         server_args: ServerArgs,
         port_args: PortArgs,
     ):
+        self.server_args = server_args
+
         context = zmq.asyncio.Context(2)
         self.recv_from_detokenizer = context.socket(zmq.PULL)
         self.recv_from_detokenizer.bind(f"tcp://127.0.0.1:{port_args.tokenizer_port}")
@@ -1,7 +1,5 @@
-import os
-from typing import Optional, Union
+from typing import Optional
 
-import torch
 from sglang.srt.hf_transformers_utils import get_config, get_context_length
 
 
@@ -11,15 +9,24 @@ class ModelConfig:
         path: str,
         trust_remote_code: bool = True,
         revision: Optional[str] = None,
+        context_length: Optional[int] = None,
     ) -> None:
         self.path = path
         self.trust_remote_code = trust_remote_code
         self.revision = revision
         self.hf_config = get_config(self.path, trust_remote_code, revision)
 
+        if context_length is not None:
+            self.context_len = context_length
+        else:
+            self.context_len = get_context_length(self.hf_config)
+
         # Unify the config keys for hf_config
-        self.context_len = get_context_length(self.hf_config)
-        self.head_dim = self.hf_config.hidden_size // self.hf_config.num_attention_heads
+        self.head_dim = getattr(
+            self.hf_config,
+            "head_dim",
+            self.hf_config.hidden_size // self.hf_config.num_attention_heads,
+        )
         self.num_attention_heads = self.hf_config.num_attention_heads
         self.num_key_value_heads = getattr(self.hf_config, "num_key_value_heads", None)
         if self.num_key_value_heads is None:
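Note: the ModelConfig change adds two fallbacks: an explicit context_length overrides the value derived from the HF config, and head_dim is read from the config when present, otherwise derived from hidden_size // num_attention_heads. A minimal sketch with a stand-in config object (get_context_length is replaced here by a plain attribute):

    from types import SimpleNamespace

    hf_config = SimpleNamespace(hidden_size=4096, num_attention_heads=32,
                                max_position_embeddings=8192)

    def resolve(hf_config, context_length=None):
        context_len = (context_length if context_length is not None
                       else hf_config.max_position_embeddings)
        head_dim = getattr(hf_config, "head_dim",
                           hf_config.hidden_size // hf_config.num_attention_heads)
        return context_len, head_dim

    print(resolve(hf_config))                       # (8192, 128)
    print(resolve(hf_config, context_length=4096))  # (4096, 128)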