sglang 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. sglang/api.py +13 -1
  2. sglang/bench_latency.py +10 -5
  3. sglang/bench_serving.py +50 -26
  4. sglang/check_env.py +15 -0
  5. sglang/global_config.py +1 -1
  6. sglang/lang/backend/runtime_endpoint.py +60 -49
  7. sglang/lang/chat_template.py +10 -5
  8. sglang/lang/compiler.py +4 -0
  9. sglang/lang/interpreter.py +5 -2
  10. sglang/lang/ir.py +22 -4
  11. sglang/launch_server.py +8 -1
  12. sglang/srt/constrained/jump_forward.py +13 -2
  13. sglang/srt/conversation.py +50 -1
  14. sglang/srt/hf_transformers_utils.py +22 -23
  15. sglang/srt/layers/activation.py +24 -2
  16. sglang/srt/layers/decode_attention.py +338 -50
  17. sglang/srt/layers/extend_attention.py +3 -1
  18. sglang/srt/layers/fused_moe/__init__.py +1 -0
  19. sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
  20. sglang/srt/layers/fused_moe/layer.py +587 -0
  21. sglang/srt/layers/layernorm.py +3 -0
  22. sglang/srt/layers/logits_processor.py +64 -27
  23. sglang/srt/layers/radix_attention.py +41 -18
  24. sglang/srt/layers/sampler.py +154 -0
  25. sglang/srt/managers/controller_multi.py +2 -8
  26. sglang/srt/managers/controller_single.py +7 -10
  27. sglang/srt/managers/detokenizer_manager.py +20 -9
  28. sglang/srt/managers/io_struct.py +44 -11
  29. sglang/srt/managers/policy_scheduler.py +5 -2
  30. sglang/srt/managers/schedule_batch.py +59 -179
  31. sglang/srt/managers/tokenizer_manager.py +193 -84
  32. sglang/srt/managers/tp_worker.py +131 -50
  33. sglang/srt/mem_cache/memory_pool.py +82 -8
  34. sglang/srt/mm_utils.py +79 -7
  35. sglang/srt/model_executor/cuda_graph_runner.py +97 -28
  36. sglang/srt/model_executor/forward_batch_info.py +188 -82
  37. sglang/srt/model_executor/model_runner.py +269 -87
  38. sglang/srt/models/chatglm.py +6 -14
  39. sglang/srt/models/commandr.py +6 -2
  40. sglang/srt/models/dbrx.py +5 -1
  41. sglang/srt/models/deepseek.py +7 -3
  42. sglang/srt/models/deepseek_v2.py +12 -7
  43. sglang/srt/models/gemma.py +6 -2
  44. sglang/srt/models/gemma2.py +22 -8
  45. sglang/srt/models/gpt_bigcode.py +5 -1
  46. sglang/srt/models/grok.py +66 -398
  47. sglang/srt/models/internlm2.py +5 -1
  48. sglang/srt/models/llama2.py +7 -3
  49. sglang/srt/models/llama_classification.py +2 -2
  50. sglang/srt/models/llama_embedding.py +4 -0
  51. sglang/srt/models/llava.py +176 -59
  52. sglang/srt/models/minicpm.py +7 -3
  53. sglang/srt/models/mixtral.py +61 -255
  54. sglang/srt/models/mixtral_quant.py +6 -5
  55. sglang/srt/models/qwen.py +7 -4
  56. sglang/srt/models/qwen2.py +15 -5
  57. sglang/srt/models/qwen2_moe.py +7 -16
  58. sglang/srt/models/stablelm.py +6 -2
  59. sglang/srt/openai_api/adapter.py +149 -58
  60. sglang/srt/sampling/sampling_batch_info.py +209 -0
  61. sglang/srt/{sampling_params.py → sampling/sampling_params.py} +18 -4
  62. sglang/srt/server.py +107 -71
  63. sglang/srt/server_args.py +49 -15
  64. sglang/srt/utils.py +27 -18
  65. sglang/test/runners.py +38 -38
  66. sglang/test/simple_eval_common.py +9 -10
  67. sglang/test/simple_eval_gpqa.py +2 -1
  68. sglang/test/simple_eval_humaneval.py +2 -2
  69. sglang/test/simple_eval_math.py +2 -1
  70. sglang/test/simple_eval_mmlu.py +2 -1
  71. sglang/test/test_activation.py +55 -0
  72. sglang/test/test_programs.py +32 -5
  73. sglang/test/test_utils.py +37 -50
  74. sglang/version.py +1 -1
  75. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/METADATA +102 -27
  76. sglang-0.2.14.dist-info/RECORD +114 -0
  77. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/WHEEL +1 -1
  78. sglang/launch_server_llavavid.py +0 -29
  79. sglang/srt/model_loader/model_loader.py +0 -292
  80. sglang/srt/model_loader/utils.py +0 -275
  81. sglang-0.2.12.dist-info/RECORD +0 -112
  82. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/LICENSE +0 -0
  83. {sglang-0.2.12.dist-info → sglang-0.2.14.dist-info}/top_level.txt +0 -0
sglang/srt/managers/io_struct.py
@@ -22,10 +22,8 @@ import uuid
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
 
-import torch
-
 from sglang.srt.managers.schedule_batch import BaseFinishReason
-from sglang.srt.sampling_params import SamplingParams
+from sglang.srt.sampling.sampling_params import SamplingParams
 
 
 @dataclass
@@ -43,9 +41,9 @@ class GenerateReqInput:
     rid: Optional[Union[List[str], str]] = None
     # Whether to return logprobs.
     return_logprob: Optional[Union[List[bool], bool]] = None
-    # The start location of the prompt for return_logprob.
+    # If return logprobs, the start location in the prompt for returning logprobs.
     logprob_start_len: Optional[Union[List[int], int]] = None
-    # The number of top logprobs to return.
+    # If return logprobs, the number of top logprobs to return at each position.
     top_logprobs_num: Optional[Union[List[int], int]] = None
     # Whether to detokenize tokens in text in the returned logprobs.
     return_text_in_logprobs: bool = False
@@ -77,7 +75,7 @@ class GenerateReqInput:
             if self.return_logprob is None:
                 self.return_logprob = False
             if self.logprob_start_len is None:
-                self.logprob_start_len = 0
+                self.logprob_start_len = -1
             if self.top_logprobs_num is None:
                 self.top_logprobs_num = 0
         else:
@@ -143,7 +141,7 @@
                 self.return_logprob = [self.return_logprob] * num
 
             if self.logprob_start_len is None:
-                self.logprob_start_len = [0] * num
+                self.logprob_start_len = [-1] * num
             elif not isinstance(self.logprob_start_len, list):
                 self.logprob_start_len = [self.logprob_start_len] * num
 
@@ -155,16 +153,27 @@
 
 @dataclass
 class TokenizedGenerateReqInput:
+    # The request id
     rid: str
+    # The input text
     input_text: str
+    # The input token ids
     input_ids: List[int]
+    # The pixel values for input images
     pixel_values: List[float]
+    # The hash of input images
     image_hash: int
+    # The image size
     image_size: List[int]
+    # The sampling parameters
     sampling_params: SamplingParams
+    # Whether to return the logprobs
     return_logprob: bool
+    # If return logprobs, the start location in the prompt for returning logprobs.
     logprob_start_len: int
+    # If return logprobs, the number of top logprobs to return at each position.
     top_logprobs_num: int
+    # Whether to stream output
     stream: bool
 
 
@@ -215,15 +224,21 @@ class EmbeddingReqInput:
 
 @dataclass
 class TokenizedEmbeddingReqInput:
+    # The request id
     rid: str
+    # The input text
     input_text: str
+    # The input token ids
     input_ids: List[int]
+    # Dummy sampling params for compatibility
     sampling_params: SamplingParams
 
 
 @dataclass
 class BatchTokenIDOut:
+    # The request id
     rids: List[str]
+    # The version id to sync decode status with in detokenizer_manager
     vids: List[int]
     decoded_texts: List[str]
     decode_ids: List[int]
@@ -236,17 +251,25 @@ class BatchTokenIDOut:
 
 @dataclass
 class BatchStrOut:
+    # The request id
     rids: List[str]
+    # The output decoded strings
     output_strs: List[str]
+    # The meta info
     meta_info: List[Dict]
+    # The finish reason
     finished_reason: List[BaseFinishReason]
 
 
 @dataclass
 class BatchEmbeddingOut:
+    # The request id
     rids: List[str]
+    # The output embedding
     embeddings: List[List[float]]
+    # The meta info
     meta_info: List[Dict]
+    # The finish reason
     finished_reason: List[BaseFinishReason]
 
 
@@ -256,10 +279,20 @@ class FlushCacheReq:
 
 
 @dataclass
-class AbortReq:
-    rid: str
+class UpdateWeightReqInput:
+    # The model path with the new weights
+    model_path: str
+    # The format to load the weights
+    load_format: Optional[str] = None
 
 
 @dataclass
-class DetokenizeReqInput:
-    input_ids: List[int]
+class UpdateWeightReqOutput:
+    success: bool
+    message: str
+
+
+@dataclass
+class AbortReq:
+    # The request id
+    rid: str
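
The hunk above replaces the old AbortReq/DetokenizeReqInput pair with a weight-reload message type plus a relocated AbortReq. A minimal sketch of exercising the new dataclasses, not taken from the package itself; it assumes sglang 0.2.14 is installed so the import resolves, and the checkpoint path is a placeholder:

# Sketch only: constructing the request/response pair added to io_struct.py above.
from sglang.srt.managers.io_struct import UpdateWeightReqInput, UpdateWeightReqOutput

req = UpdateWeightReqInput(model_path="/path/to/new/checkpoint")  # load_format defaults to None
print(req.model_path, req.load_format)

# A hypothetical success reply the server side might produce.
resp = UpdateWeightReqOutput(success=True, message="weights reloaded")
print(resp.success, resp.message)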

sglang/srt/managers/policy_scheduler.py
@@ -111,11 +111,14 @@ class PrefillAdder:
         rem_total_tokens: int,
         rem_input_tokens: int,
         rem_chunk_tokens: Optional[int],
+        mixed_with_decode_tokens: int = 0,
     ):
         self.tree_cache = tree_cache
-        self.rem_total_tokens = rem_total_tokens
-        self.rem_input_tokens = rem_input_tokens
+        self.rem_total_tokens = rem_total_tokens - mixed_with_decode_tokens
+        self.rem_input_tokens = rem_input_tokens - mixed_with_decode_tokens
         self.rem_chunk_tokens = rem_chunk_tokens
+        if self.rem_chunk_tokens is not None:
+            self.rem_chunk_tokens -= mixed_with_decode_tokens
 
         self.can_run_list = []
         self.new_inflight_req = None
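
The new mixed_with_decode_tokens argument lets the scheduler reserve room for decode tokens that share the forward pass with a chunked prefill. A small illustration with invented numbers, mirroring only the subtraction logic in the hunk above:

# Illustration only: every remaining prefill budget shrinks by the tokens the
# running decode requests will add to the mixed batch (values are made up).
rem_total_tokens, rem_input_tokens, rem_chunk_tokens = 8192, 4096, 2048
mixed_with_decode_tokens = 64  # e.g. 64 running requests, one new token each

rem_total_tokens -= mixed_with_decode_tokens      # 8128
rem_input_tokens -= mixed_with_decode_tokens      # 4032
if rem_chunk_tokens is not None:
    rem_chunk_tokens -= mixed_with_decode_tokens  # 1984
print(rem_total_tokens, rem_input_tokens, rem_chunk_tokens)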

sglang/srt/managers/schedule_batch.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 """
 Copyright 2023-2024 SGLang Team
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,20 +18,22 @@ limitations under the License.
 """Meta data for requests and batches"""
 
 import logging
-import warnings
 from dataclasses import dataclass
-from typing import List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 
 import torch
-from flashinfer.sampling import top_k_top_p_sampling_from_probs
 
-import sglang.srt.sampling.penaltylib as penaltylib
 from sglang.global_config import global_config
 from sglang.srt.constrained import RegexGuide
 from sglang.srt.constrained.jump_forward import JumpForwardMap
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
+from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.sampler import SampleOutput
+
 
 INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
 
@@ -37,7 +41,7 @@ INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
 global_server_args_dict = {
     "disable_flashinfer": False,
     "disable_flashinfer_sampling": False,
-    "attention_reduce_in_fp32": False,
+    "triton_attention_reduce_in_fp32": False,
     "enable_mla": False,
 }
 
@@ -235,10 +239,12 @@ class Req:
             return
 
         last_token_id = self.output_ids[-1]
-        if self.tokenizer is None:
-            matched_eos = last_token_id in self.sampling_params.stop_token_ids
-        else:
-            matched_eos = last_token_id == self.tokenizer.eos_token_id
+
+        matched_eos = last_token_id in self.sampling_params.stop_token_ids
+
+        if self.tokenizer is not None:
+            matched_eos |= last_token_id == self.tokenizer.eos_token_id
+
         if matched_eos and not self.sampling_params.ignore_eos:
             self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id)
             return
@@ -266,7 +272,7 @@
 
         if all_ids[prompt_tokens - 1] != self.origin_input_ids_unpadded[-1]:
             # TODO(lsyin): fix token fusion
-            warnings.warn(
+            logger.warning(
                 "Token fusion between input and output, try to avoid this by removing the space at the end of the input."
             )
             return False
@@ -325,17 +331,13 @@ class ScheduleBatch:
     out_cache_loc: torch.Tensor = None
     extend_num_tokens: int = None
 
+    # For mixed chunekd prefill
+    prefix_lens_cpu: List[int] = None
+
     # For processing logprobs
     return_logprob: bool = False
     top_logprobs_nums: List[int] = None
 
-    # Batched sampling params
-    temperatures: torch.Tensor = None
-    top_ps: torch.Tensor = None
-    top_ks: torch.Tensor = None
-    penalizer_orchestrator: penaltylib.BatchedPenalizerOrchestrator = None
-    logit_bias: torch.Tensor = None
-
     @classmethod
     def init_new(cls, reqs, req_to_token_pool, token_to_kv_pool, tree_cache):
         return_logprob = any(req.return_logprob for req in reqs)
@@ -383,51 +385,7 @@
 
         return out_cache_loc
 
-    def batch_sampling_params(self, vocab_size, int_token_logit_bias):
-        device = "cuda"
-        bs, reqs = self.batch_size(), self.reqs
-        self.temperatures = torch.tensor(
-            [r.sampling_params.temperature for r in reqs],
-            dtype=torch.float,
-            device=device,
-        ).view(-1, 1)
-        self.top_ps = torch.tensor(
-            [r.sampling_params.top_p for r in reqs], dtype=torch.float, device=device
-        )
-        self.top_ks = torch.tensor(
-            [r.sampling_params.top_k for r in reqs], dtype=torch.int, device=device
-        )
-
-        # Each penalizers will do nothing if they evaluate themselves as not required by looking at
-        # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
-        # should not add hefty computation overhead other than simple checks.
-        #
-        # While we choose not to even create the class instances if they are not required, this
-        # could add additional complexity to the {ScheduleBatch} class, especially we need to
-        # handle {filter_batch()} and {merge()} cases as well.
-        self.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
-            vocab_size=vocab_size,
-            batch=self,
-            device=device,
-            Penalizers={
-                penaltylib.BatchedFrequencyPenalizer,
-                penaltylib.BatchedMinNewTokensPenalizer,
-                penaltylib.BatchedPresencePenalizer,
-                penaltylib.BatchedRepetitionPenalizer,
-            },
-        )
-
-        # Handle logit bias but only allocate when needed
-        self.logit_bias = None
-        for i in range(bs):
-            if reqs[i].sampling_params.dtype == "int":
-                if self.logit_bias is None:
-                    self.logit_bias = torch.zeros(
-                        (bs, vocab_size), dtype=torch.float32, device=device
-                    )
-                self.logit_bias[i][: len(int_token_logit_bias)] = int_token_logit_bias
-
-    def prepare_for_extend(self, vocab_size: int, int_token_logit_bias: torch.Tensor):
+    def prepare_for_extend(self, vocab_size: int):
         bs = self.batch_size()
         reqs = self.reqs
         input_ids = [r.fill_ids[len(r.prefix_indices) :] for r in reqs]
@@ -465,8 +423,32 @@
         self.extend_num_tokens = extend_num_tokens
         self.out_cache_loc = out_cache_loc
         self.top_logprobs_nums = [r.top_logprobs_num for r in reqs]
+        self.prefix_lens_cpu = [len(r.prefix_indices) for r in reqs]
+
+        self.sampling_info = SamplingBatchInfo.from_schedule_batch(self, vocab_size)
+
+    def mix_with_running(self, running_batch: "ScheduleBatch"):
+        # NOTE: prefix_indices is what has been cached, but we don't cache each decode step
+        prefix_lens_cpu = [len(r.prefix_indices) for r in self.reqs]
+        prefix_lens_cpu.extend(
+            [
+                len(r.origin_input_ids) + len(r.output_ids) - 1
+                for r in running_batch.reqs
+            ]
+        )
+
+        for req in running_batch.reqs:
+            req.fill_ids = req.origin_input_ids + req.output_ids
+            req.extend_input_len = 1
 
-        self.batch_sampling_params(vocab_size, int_token_logit_bias)
+        input_ids = torch.cat([self.input_ids, running_batch.input_ids])
+        out_cache_loc = torch.cat([self.out_cache_loc, running_batch.out_cache_loc])
+        extend_num_tokens = self.extend_num_tokens + running_batch.batch_size()
+        self.merge(running_batch)
+        self.input_ids = input_ids
+        self.out_cache_loc = out_cache_loc
+        self.extend_num_tokens = extend_num_tokens
+        self.prefix_lens_cpu = prefix_lens_cpu
 
     def check_decode_mem(self):
         bs = self.batch_size()
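
mix_with_running above folds a running decode batch into an extend batch: each decode request contributes exactly one new token, so everything before that token counts as prefix. A standalone sketch of just that prefix-length bookkeeping, using hypothetical stand-in objects rather than real Req instances:

# Sketch only: the prefix_lens_cpu computation from mix_with_running, applied to
# plain dataclasses (names and values below are invented for illustration).
from dataclasses import dataclass, field
from typing import List

@dataclass
class FakeReq:
    origin_input_ids: List[int]
    output_ids: List[int]
    prefix_indices: List[int] = field(default_factory=list)

prefill_reqs = [FakeReq(origin_input_ids=[1, 2, 3, 4], output_ids=[], prefix_indices=[10, 11])]
decode_reqs = [FakeReq(origin_input_ids=[5, 6, 7], output_ids=[8, 9])]

# New prefill requests: the prefix is whatever was already cached.
prefix_lens_cpu = [len(r.prefix_indices) for r in prefill_reqs]                            # [2]
# Running decode requests: everything except the single token being extended is prefix.
prefix_lens_cpu += [len(r.origin_input_ids) + len(r.output_ids) - 1 for r in decode_reqs]  # [2, 4]
print(prefix_lens_cpu)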
@@ -639,7 +621,7 @@ class ScheduleBatch:
                 for r in self.reqs
             ]
         else:
-            self.penalizer_orchestrator.cumulate_input_tokens(input_ids)
+            self.sampling_info.penalizer_orchestrator.cumulate_input_tokens(input_ids)
 
         self.input_ids = torch.tensor(input_ids, dtype=torch.int32, device="cuda")
         self.seq_lens.add_(1)
@@ -652,6 +634,8 @@
             self.req_pool_indices, self.seq_lens - 1
         ] = self.out_cache_loc
 
+        self.sampling_info.update_regex_vocab_mask(self)
+
     def filter_batch(self, unfinished_indices: List[int]):
         if unfinished_indices is None or len(unfinished_indices) == 0:
             # Filter out all requests
@@ -672,23 +656,13 @@
         self.top_logprobs_nums = [self.top_logprobs_nums[i] for i in unfinished_indices]
         self.return_logprob = any(req.return_logprob for req in self.reqs)
 
-        self.penalizer_orchestrator.filter(unfinished_indices, new_indices)
-
-        for item in [
-            "temperatures",
-            "top_ps",
-            "top_ks",
-            "logit_bias",
-        ]:
-            self_val = getattr(self, item, None)
-            if self_val is not None:  # logit_bias can be None
-                setattr(self, item, self_val[new_indices])
+        self.sampling_info.filter(unfinished_indices, new_indices)
 
     def merge(self, other: "ScheduleBatch"):
         # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because
         # orchestrator.merge() depends on Batch.reqs during preparation of each penalizers, so it
         # needs to be called with pre-merged Batch.reqs.
-        self.penalizer_orchestrator.merge(other.penalizer_orchestrator)
+        self.sampling_info.merge(other.sampling_info)
 
         self.reqs.extend(other.reqs)
 
@@ -703,111 +677,17 @@ class ScheduleBatch:
         self.top_logprobs_nums.extend(other.top_logprobs_nums)
         self.return_logprob = any(req.return_logprob for req in self.reqs)
 
-        for item in [
-            "temperatures",
-            "top_ps",
-            "top_ks",
-        ]:
-            self_val = getattr(self, item, None)
-            other_val = getattr(other, item, None)
-            setattr(self, item, torch.concat([self_val, other_val]))
-
-        # logit_bias can be None
-        if self.logit_bias is not None or other.logit_bias is not None:
-            vocab_size = (
-                self.logit_bias.shape[1]
-                if self.logit_bias is not None
-                else other.logit_bias.shape[1]
-            )
-            if self.logit_bias is None:
-                self.logit_bias = torch.zeros(
-                    (len(self.reqs), vocab_size), dtype=torch.float32, device="cuda"
-                )
-            if other.logit_bias is None:
-                other.logit_bias = torch.zeros(
-                    (len(other.reqs), vocab_size), dtype=torch.float32, device="cuda"
-                )
-            self.logit_bias = torch.concat([self.logit_bias, other.logit_bias])
-
-    def sample(self, logits: torch.Tensor):
-        # TODO(lsyin): move this into a part of layer and run with CUDA Graph
-        # Post process logits
-        logits = logits.contiguous()
-        logits.div_(self.temperatures)
-        if self.logit_bias is not None:
-            logits.add_(self.logit_bias)
-
-        has_regex = any(req.regex_fsm is not None for req in self.reqs)
-        if has_regex:
-            allowed_mask = torch.empty_like(logits[0], dtype=torch.bool)
-            for i, req in enumerate(self.reqs):
-                if req.regex_fsm is not None:
-                    allowed_mask.zero_()
-                    allowed_mask[
-                        req.regex_fsm.get_next_instruction(req.regex_fsm_state).tokens
-                    ] = 1
-                    logits[i].masked_fill_(~allowed_mask, float("-inf"))
-
-        logits = self.penalizer_orchestrator.apply(logits)
-
-        probs = torch.softmax(logits, dim=-1)
-
-        if not global_server_args_dict["disable_flashinfer_sampling"]:
-            max_top_k_round, batch_size = 32, probs.shape[0]
-            uniform_samples = torch.rand(
-                (max_top_k_round, batch_size), device=probs.device
-            )
-            batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
-                probs, uniform_samples, self.top_ks, self.top_ps
-            )
-        else:
-            # Here we provide a slower fallback implementation.
-            batch_next_token_ids, success = top_k_top_p_sampling_from_probs_torch(
-                probs, self.top_ks, self.top_ps
-            )
-
-        if not torch.all(success):
-            warnings.warn("Sampling failed, fallback to top_k=1 strategy")
+    def check_sample_results(self, sample_output: SampleOutput):
+        if not torch.all(sample_output.success):
+            probs = sample_output.probs
+            batch_next_token_ids = sample_output.batch_next_token_ids
+            logging.warning("Sampling failed, fallback to top_k=1 strategy")
             probs = probs.masked_fill(torch.isnan(probs), 0.0)
             argmax_ids = torch.argmax(probs, dim=-1)
             batch_next_token_ids = torch.where(
-                success, batch_next_token_ids, argmax_ids
+                sample_output.success, batch_next_token_ids, argmax_ids
             )
+            sample_output.probs = probs
+            sample_output.batch_next_token_ids = batch_next_token_ids
 
-        if has_regex:
-            batch_next_token_ids_cpu = batch_next_token_ids.cpu().numpy()
-            for i, req in enumerate(self.reqs):
-                if req.regex_fsm is not None:
-                    req.regex_fsm_state = req.regex_fsm.get_next_state(
-                        req.regex_fsm_state, batch_next_token_ids_cpu[i]
-                    )
-
-        self.penalizer_orchestrator.cumulate_output_tokens(batch_next_token_ids)
-
-        return batch_next_token_ids
-
-
-def top_k_top_p_sampling_from_probs_torch(
-    probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor
-):
-    """A top-k and top-k sampling implementation with native pytorch operations."""
-    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
-    probs_sum = torch.cumsum(probs_sort, dim=-1)
-    probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
-    probs_sort[
-        torch.arange(0, probs.shape[-1], device=probs.device).view(1, -1)
-        >= top_ks.view(-1, 1)
-    ] = 0.0
-    probs_sort.div_(probs_sort.max(dim=-1, keepdim=True)[0])
-    try:
-        sampled_index = torch.multinomial(probs_sort, num_samples=1)
-    except RuntimeError:
-        batch_next_token_ids = torch.zeros(
-            (probs_sort.shape[0],), dtype=torch.int32, device=probs.device
-        )
-        success = torch.zeros(probs.shape[0], dtype=torch.bool, device=probs.device)
-        return batch_next_token_ids, success
-
-    batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index).view(-1)
-    success = torch.ones(probs.shape[0], dtype=torch.bool, device=probs.device)
-    return batch_next_token_ids, success
+        return sample_output.batch_next_token_ids
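
The old in-place sampling fallback survives above as check_sample_results: rows where top-k/top-p sampling reported failure fall back to a greedy argmax over the NaN-cleaned probabilities. A self-contained sketch of just that recovery step, using hand-made tensors instead of a real SampleOutput object:

# Sketch only: reproducing the argmax fallback shown in check_sample_results.
import torch

probs = torch.tensor([[0.7, 0.2, 0.1],
                      [float("nan"), float("nan"), float("nan")]])
batch_next_token_ids = torch.tensor([0, -1])
success = torch.tensor([True, False])

if not torch.all(success):
    probs = probs.masked_fill(torch.isnan(probs), 0.0)  # NaNs -> 0 so argmax is defined
    argmax_ids = torch.argmax(probs, dim=-1)            # greedy (top_k=1) choice per row
    batch_next_token_ids = torch.where(success, batch_next_token_ids, argmax_ids)

print(batch_next_token_ids)  # tensor([0, 0]); failed row replaced by its argmax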