sglang 0.3.1.post3__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff compares the published contents of two package versions as they appear in their public registry. It is provided for informational purposes only.
- sglang/__init__.py +2 -0
- sglang/api.py +23 -1
- sglang/bench_latency.py +48 -33
- sglang/bench_server_latency.py +0 -6
- sglang/bench_serving.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +14 -1
- sglang/lang/interpreter.py +16 -6
- sglang/lang/ir.py +20 -4
- sglang/srt/configs/model_config.py +11 -9
- sglang/srt/constrained/fsm_cache.py +9 -1
- sglang/srt/constrained/jump_forward.py +15 -2
- sglang/srt/hf_transformers_utils.py +1 -0
- sglang/srt/layers/activation.py +4 -4
- sglang/srt/layers/attention/__init__.py +49 -0
- sglang/srt/layers/attention/flashinfer_backend.py +277 -0
- sglang/srt/layers/{flashinfer_utils.py → attention/flashinfer_utils.py} +82 -80
- sglang/srt/layers/attention/triton_backend.py +161 -0
- sglang/srt/layers/{triton_attention → attention/triton_ops}/extend_attention.py +3 -1
- sglang/srt/layers/fused_moe/patch.py +117 -0
- sglang/srt/layers/layernorm.py +4 -4
- sglang/srt/layers/logits_processor.py +19 -15
- sglang/srt/layers/pooler.py +3 -3
- sglang/srt/layers/quantization/__init__.py +0 -2
- sglang/srt/layers/radix_attention.py +6 -4
- sglang/srt/layers/sampler.py +6 -4
- sglang/srt/layers/torchao_utils.py +18 -0
- sglang/srt/lora/lora.py +20 -21
- sglang/srt/lora/lora_manager.py +97 -25
- sglang/srt/managers/detokenizer_manager.py +31 -18
- sglang/srt/managers/image_processor.py +187 -0
- sglang/srt/managers/io_struct.py +99 -75
- sglang/srt/managers/schedule_batch.py +187 -68
- sglang/srt/managers/{policy_scheduler.py → schedule_policy.py} +31 -21
- sglang/srt/managers/scheduler.py +1021 -0
- sglang/srt/managers/tokenizer_manager.py +120 -247
- sglang/srt/managers/tp_worker.py +28 -925
- sglang/srt/mem_cache/memory_pool.py +34 -52
- sglang/srt/mem_cache/radix_cache.py +5 -5
- sglang/srt/model_executor/cuda_graph_runner.py +25 -25
- sglang/srt/model_executor/forward_batch_info.py +94 -97
- sglang/srt/model_executor/model_runner.py +76 -78
- sglang/srt/models/baichuan.py +10 -10
- sglang/srt/models/chatglm.py +12 -12
- sglang/srt/models/commandr.py +10 -10
- sglang/srt/models/dbrx.py +12 -12
- sglang/srt/models/deepseek.py +10 -10
- sglang/srt/models/deepseek_v2.py +14 -15
- sglang/srt/models/exaone.py +10 -10
- sglang/srt/models/gemma.py +10 -10
- sglang/srt/models/gemma2.py +11 -11
- sglang/srt/models/gpt_bigcode.py +10 -10
- sglang/srt/models/grok.py +10 -10
- sglang/srt/models/internlm2.py +10 -10
- sglang/srt/models/llama.py +22 -10
- sglang/srt/models/llama_classification.py +5 -5
- sglang/srt/models/llama_embedding.py +4 -4
- sglang/srt/models/llama_reward.py +142 -0
- sglang/srt/models/llava.py +39 -33
- sglang/srt/models/llavavid.py +31 -28
- sglang/srt/models/minicpm.py +10 -10
- sglang/srt/models/minicpm3.py +14 -15
- sglang/srt/models/mixtral.py +10 -10
- sglang/srt/models/mixtral_quant.py +10 -10
- sglang/srt/models/olmoe.py +10 -10
- sglang/srt/models/qwen.py +10 -10
- sglang/srt/models/qwen2.py +11 -11
- sglang/srt/models/qwen2_moe.py +10 -10
- sglang/srt/models/stablelm.py +10 -10
- sglang/srt/models/torch_native_llama.py +506 -0
- sglang/srt/models/xverse.py +10 -10
- sglang/srt/models/xverse_moe.py +10 -10
- sglang/srt/openai_api/adapter.py +7 -0
- sglang/srt/sampling/sampling_batch_info.py +36 -27
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +170 -119
- sglang/srt/server_args.py +54 -27
- sglang/srt/utils.py +101 -128
- sglang/test/runners.py +76 -33
- sglang/test/test_programs.py +38 -5
- sglang/test/test_utils.py +53 -9
- sglang/version.py +1 -1
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/METADATA +42 -23
- sglang-0.3.3.dist-info/RECORD +139 -0
- sglang/srt/layers/attention_backend.py +0 -482
- sglang/srt/managers/controller_multi.py +0 -207
- sglang/srt/managers/controller_single.py +0 -164
- sglang-0.3.1.post3.dist-info/RECORD +0 -134
- /sglang/srt/layers/{triton_attention → attention/triton_ops}/decode_attention.py +0 -0
- /sglang/srt/layers/{triton_attention → attention/triton_ops}/prefill_attention.py +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/LICENSE +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/WHEEL +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/top_level.txt +0 -0
sglang/srt/managers/io_struct.py
CHANGED
@@ -18,7 +18,6 @@ The definition of objects transfered between different
 processes (TokenizerManager, DetokenizerManager, Controller).
 """

-import copy
 import uuid
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
@@ -37,7 +36,7 @@ class GenerateReqInput:
     # See also python/sglang/srt/utils.py:load_image.
     image_data: Optional[Union[List[str], str]] = None
     # The sampling_params. See descriptions below.
-    sampling_params: Union[List[Dict], Dict] = None
+    sampling_params: Optional[Union[List[Dict], Dict]] = None
     # The request id.
     rid: Optional[Union[List[str], str]] = None
     # Whether to return logprobs.
@@ -53,9 +52,6 @@ class GenerateReqInput:
     stream: bool = False
     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
-
-    is_single: bool = True
-
     # LoRA related
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None

@@ -65,19 +61,41 @@ class GenerateReqInput:
         ):
             raise ValueError("Either text or input_ids should be provided.")

-        […]
+        self.is_single = False
+        if self.text is not None:
+            if isinstance(self.text, str):
+                self.is_single = True
+                self.batch_size = 1
+            else:
+                self.batch_size = len(self.text)
         else:
-            if self. […]
-            is_single = […]
+            if isinstance(self.input_ids[0], int):
+                self.is_single = True
+                self.batch_size = 1
             else:
-                […]
+                self.batch_size = len(self.input_ids)
+
+        if self.sampling_params is None:
+            self.parallel_sample_num = 1
+        elif isinstance(self.sampling_params, dict):
+            self.parallel_sample_num = self.sampling_params.get("n", 1)
+        else:  # isinstance(self.sampling_params, list):
+            self.parallel_sample_num = self.sampling_params[0].get("n", 1)
+            for sp in self.sampling_params:
+                # TODO cope with the case that the parallel_sample_num is different for different samples
+                assert self.parallel_sample_num == sp.get(
+                    "n", 1
+                ), "The parallel_sample_num should be the same for all samples in sample params."
+
+        if self.parallel_sample_num > 1:
+            if self.is_single:
+                self.is_single = False
+                if self.text is not None:
+                    self.text = [self.text]
+                if self.input_ids is not None:
+                    self.input_ids = [self.input_ids]
+
+        if self.is_single:
             if self.sampling_params is None:
                 self.sampling_params = {}
             if self.rid is None:
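Taken together, the rewritten post_init above normalizes every request up front: it derives is_single and batch_size from text/input_ids, lifts the parallel-sample count n out of sampling_params into parallel_sample_num, and promotes a single prompt to a one-element batch whenever n > 1. A minimal sketch of the resulting behavior, assuming sglang 0.3.3 is importable (the class, field, and method names come straight from the diff above):

    from sglang.srt.managers.io_struct import GenerateReqInput

    # One prompt, but n=2 parallel samples: post_init flips it to batch mode.
    req = GenerateReqInput(text="Hello", sampling_params={"n": 2})
    req.post_init()
    assert req.parallel_sample_num == 2  # read from sampling_params["n"]
    assert req.batch_size == 1           # one original prompt
    assert req.is_single is False        # n > 1 forces the batch path
    assert req.text == ["Hello"]         # the prompt was wrapped into a list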
@@ -89,79 +107,54 @@ class GenerateReqInput:
             if self.top_logprobs_num is None:
                 self.top_logprobs_num = 0
         else:
-            […]
-                parallel_sample_num = self.sampling_params.get("n", 1)
-            elif isinstance(self.sampling_params, list):
-                for sp in self.sampling_params:
-                    parallel_sample_num = sp.get("n", 1)
-                    parallel_sample_num_list.append(parallel_sample_num)
-                parallel_sample_num = max(parallel_sample_num_list)
-                all_equal = all(
-                    element == parallel_sample_num
-                    for element in parallel_sample_num_list
-                )
-                if parallel_sample_num > 1 and (not all_equal):
-                    # TODO cope with the case that the parallel_sample_num is different for different samples
-                    raise ValueError(
-                        "The parallel_sample_num should be the same for all samples in sample params."
-                    )
+            if self.parallel_sample_num == 1:
+                num = self.batch_size
             else:
-                […]
-                if parallel_sample_num != 1:
-                    # parallel sampling +1 represents the original prefill stage
-                    num = parallel_sample_num + 1
-                    if isinstance(self.text, list):
-                        # suppot batch operation
-                        self.batch_size = len(self.text)
-                        num = num * len(self.text)
-                    elif isinstance(self.input_ids, list) and isinstance(
-                        self.input_ids[0], list
-                    ):
-                        self.batch_size = len(self.input_ids)
-                        num = num * len(self.input_ids)
-                    else:
-                        self.batch_size = 1
-                else:
-                    # support select operation
-                    num = len(self.text) if self.text is not None else len(self.input_ids)
-                    self.batch_size = num
+                # FIXME support cascade inference
+                # first bs samples are used for caching the prefix for parallel sampling
+                num = self.batch_size + self.parallel_sample_num * self.batch_size

             if self.image_data is None:
                 self.image_data = [None] * num
             elif not isinstance(self.image_data, list):
                 self.image_data = [self.image_data] * num
             elif isinstance(self.image_data, list):
-                # […]
+                # FIXME incorrect order for duplication
                 self.image_data = self.image_data * num

             if self.sampling_params is None:
                 self.sampling_params = [{}] * num
             elif not isinstance(self.sampling_params, list):
                 self.sampling_params = [self.sampling_params] * num
+            else:
+                assert self.parallel_sample_num == 1

             if self.rid is None:
                 self.rid = [uuid.uuid4().hex for _ in range(num)]
             else:
-                […]
+                assert isinstance(self.rid, list), "The rid should be a list."
+                assert self.parallel_sample_num == 1

             if self.return_logprob is None:
                 self.return_logprob = [False] * num
             elif not isinstance(self.return_logprob, list):
                 self.return_logprob = [self.return_logprob] * num
+            else:
+                assert self.parallel_sample_num == 1

             if self.logprob_start_len is None:
                 self.logprob_start_len = [-1] * num
             elif not isinstance(self.logprob_start_len, list):
                 self.logprob_start_len = [self.logprob_start_len] * num
+            else:
+                assert self.parallel_sample_num == 1

             if self.top_logprobs_num is None:
                 self.top_logprobs_num = [0] * num
             elif not isinstance(self.top_logprobs_num, list):
                 self.top_logprobs_num = [self.top_logprobs_num] * num
+            else:
+                assert self.parallel_sample_num == 1


 @dataclass
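The sizing rule introduced here is worth spelling out: under parallel sampling, the first batch_size entries exist only to cache the shared prefix, and the sampled generations follow. Plain arithmetic, no sglang required (values chosen for illustration):

    # batch of 2 prompts, n=3 parallel samples each
    batch_size, parallel_sample_num = 2, 3
    num = batch_size + parallel_sample_num * batch_size
    assert num == 8  # 2 prefix-caching entries + 6 sampled generations
    # every per-request list built above (image_data, sampling_params, rid,
    # return_logprob, ...) is expanded to exactly `num` entries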
@@ -172,12 +165,8 @@ class TokenizedGenerateReqInput:
     input_text: str
     # The input token ids
     input_ids: List[int]
-    # The […]
-    […]
-    # The hash values of input images
-    image_hashes: List[int]
-    # The image sizes
-    image_sizes: List[List[int]]
+    # The image input
+    image_inputs: dict
     # The sampling parameters
     sampling_params: SamplingParams
     # Whether to return the logprobs
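TokenizedGenerateReqInput thus collapses the separate per-image fields into a single image_inputs dict, filled in by the new sglang/srt/managers/image_processor.py (added in this release per the file list above). The diff only pins the type as dict; purely as an illustration, a payload mirroring the fields it replaces might look like the following (the key names are an assumption, not taken from the diff):

    image_inputs = {
        "image_hashes": [1234567890],   # previously image_hashes: List[int]
        "image_sizes": [[336, 336]],    # previously image_sizes: List[List[int]]
        "modalities": ["image"],        # previously the modalites field (removed in the next hunk)
    }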
@@ -188,8 +177,6 @@ class TokenizedGenerateReqInput:
     top_logprobs_num: int
     # Whether to stream output
     stream: bool
-    # Modalities of the input images
-    modalites: Optional[List[str]] = None

     # LoRA related
     lora_path: Optional[str] = None  # None means just use the base model
@@ -206,8 +193,6 @@ class EmbeddingReqInput:
     # Dummy sampling params for compatibility
     sampling_params: Union[List[Dict], Dict] = None

-    is_single: bool = True
-
     def post_init(self):
         if (self.text is None and self.input_ids is None) or (
             self.text is not None and self.input_ids is not None
@@ -215,12 +200,11 @@ class EmbeddingReqInput:
             raise ValueError("Either text or input_ids should be provided.")

         if self.text is not None:
-            is_single = isinstance(self.text, str)
+            self.is_single = isinstance(self.text, str)
         else:
-            is_single = isinstance(self.input_ids[0], int)
-        self.is_single = is_single
+            self.is_single = isinstance(self.input_ids[0], int)

-        if is_single:
+        if self.is_single:
             if self.rid is None:
                 self.rid = uuid.uuid4().hex
             if self.sampling_params is None:
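The change here is small but behavioral: is_single is no longer a dataclass field defaulting to True (removed in the previous hunk); it is always computed in post_init. A quick sketch, assuming sglang 0.3.3 is importable:

    from sglang.srt.managers.io_struct import EmbeddingReqInput

    req = EmbeddingReqInput(text="hello world")
    req.post_init()
    assert req.is_single is True     # isinstance(self.text, str)
    assert isinstance(req.rid, str)  # a generated uuid4 hex string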
@@ -254,6 +238,50 @@ class TokenizedEmbeddingReqInput:
     sampling_params: SamplingParams


+@dataclass
+class RewardReqInput:
+    # The input prompt in the chat format. It can be a single prompt or a batch of prompts.
+    conv: Union[List[List[Dict]], List[Dict]]
+    # The request id.
+    rid: Optional[Union[List[str], str]] = None
+    # Dummy sampling params for compatibility
+    sampling_params: Union[List[Dict], Dict] = None
+
+    def post_init(self):
+        self.is_single = isinstance(self.conv[0], dict)
+
+        if self.is_single:
+            if self.rid is None:
+                self.rid = uuid.uuid4().hex
+            if self.sampling_params is None:
+                self.sampling_params = {}
+            self.sampling_params["max_new_tokens"] = 1
+        else:
+            # support select operation
+            self.batch_size = len(self.conv)
+            if self.rid is None:
+                self.rid = [uuid.uuid4().hex for _ in range(self.batch_size)]
+            else:
+                if not isinstance(self.rid, list):
+                    raise ValueError("The rid should be a list.")
+            if self.sampling_params is None:
+                self.sampling_params = [{}] * self.batch_size
+            for i in range(self.batch_size):
+                self.sampling_params[i]["max_new_tokens"] = 1
+
+
+@dataclass
+class TokenizedRewardReqInput:
+    # The request id
+    rid: str
+    # The input text
+    input_text: str
+    # The input token ids
+    input_ids: List[int]
+    # Dummy sampling params for compatibility
+    sampling_params: SamplingParams
+
+
 @dataclass
 class BatchTokenIDOut:
     # The request id
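The new RewardReqInput/TokenizedRewardReqInput pair backs the reward-model path added in this release (see sglang/srt/models/llama_reward.py in the file list above). A usage sketch, assuming sglang 0.3.3 is importable; note how post_init pins max_new_tokens to 1, since a reward model scores a conversation rather than generating from it:

    from sglang.srt.managers.io_struct import RewardReqInput

    req = RewardReqInput(conv=[
        {"role": "user", "content": "Is the sky blue?"},
        {"role": "assistant", "content": "Yes."},
    ])
    req.post_init()
    assert req.is_single is True  # conv[0] is a dict, so a single conversation
    assert req.sampling_params == {"max_new_tokens": 1}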
@@ -268,10 +296,6 @@ class BatchTokenIDOut:
     meta_info: List[Dict]
     finished_reason: List[BaseFinishReason]

-    def __post_init__(self):
-        # deepcopy meta_info to avoid modification in place
-        self.meta_info = copy.deepcopy(self.meta_info)
-

 @dataclass
 class BatchStrOut: