sglang 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +33 -26
- sglang/api.py +9 -1
- sglang/bench_latency.py +2 -2
- sglang/bench_serving.py +10 -1
- sglang/check_env.py +1 -1
- sglang/lang/backend/litellm.py +1 -1
- sglang/lang/backend/openai.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +4 -4
- sglang/lang/interpreter.py +24 -9
- sglang/lang/ir.py +1 -1
- sglang/srt/constrained/__init__.py +15 -0
- sglang/srt/constrained/base_cache.py +15 -0
- sglang/srt/constrained/fsm_cache.py +36 -1
- sglang/srt/constrained/jump_forward.py +15 -0
- sglang/srt/conversation.py +26 -0
- sglang/srt/hf_transformers_utils.py +18 -1
- sglang/srt/layers/context_flashattention_nopad.py +15 -0
- sglang/srt/layers/extend_attention.py +15 -0
- sglang/srt/layers/fused_moe.py +15 -0
- sglang/srt/layers/linear.py +15 -0
- sglang/srt/layers/logits_processor.py +109 -72
- sglang/srt/layers/quantization/__init__.py +15 -0
- sglang/srt/layers/quantization/fp8.py +15 -0
- sglang/srt/layers/radix_attention.py +21 -3
- sglang/srt/layers/token_attention.py +16 -1
- sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
- sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
- sglang/srt/managers/detokenizer_manager.py +16 -1
- sglang/srt/managers/io_struct.py +38 -5
- sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
- sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +85 -25
- sglang/srt/managers/tokenizer_manager.py +99 -57
- sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +177 -81
- sglang/srt/mem_cache/flush_cache.py +33 -0
- sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
- sglang/srt/{managers/controller → mem_cache}/radix_cache.py +15 -0
- sglang/srt/mm_utils.py +15 -0
- sglang/srt/model_config.py +20 -0
- sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +42 -18
- sglang/srt/{managers/controller → model_executor}/model_runner.py +51 -16
- sglang/srt/model_loader/model_loader.py +15 -0
- sglang/srt/model_loader/utils.py +16 -1
- sglang/srt/models/chatglm.py +16 -1
- sglang/srt/models/commandr.py +16 -1
- sglang/srt/models/dbrx.py +16 -1
- sglang/srt/models/deepseek.py +16 -1
- sglang/srt/models/deepseek_v2.py +532 -0
- sglang/srt/models/gemma.py +16 -1
- sglang/srt/models/gemma2.py +16 -1
- sglang/srt/models/gpt_bigcode.py +16 -1
- sglang/srt/models/grok.py +16 -1
- sglang/srt/models/internlm2.py +16 -1
- sglang/srt/models/llama2.py +16 -1
- sglang/srt/models/llama_classification.py +19 -4
- sglang/srt/models/llava.py +17 -2
- sglang/srt/models/llavavid.py +17 -2
- sglang/srt/models/minicpm.py +16 -1
- sglang/srt/models/mistral.py +15 -0
- sglang/srt/models/mixtral.py +16 -1
- sglang/srt/models/mixtral_quant.py +16 -1
- sglang/srt/models/qwen.py +16 -1
- sglang/srt/models/qwen2.py +16 -1
- sglang/srt/models/qwen2_moe.py +16 -1
- sglang/srt/models/stablelm.py +16 -1
- sglang/srt/models/yivl.py +15 -0
- sglang/srt/openai_api/adapter.py +545 -160
- sglang/srt/openai_api/protocol.py +65 -1
- sglang/srt/sampling_params.py +20 -4
- sglang/srt/server.py +90 -37
- sglang/srt/server_args.py +76 -17
- sglang/srt/utils.py +15 -0
- sglang/test/test_programs.py +5 -1
- sglang/utils.py +22 -0
- sglang/version.py +1 -1
- {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/METADATA +40 -12
- sglang-0.2.7.dist-info/RECORD +93 -0
- {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/WHEEL +1 -1
- sglang/srt/flush_cache.py +0 -18
- sglang-0.2.5.dist-info/RECORD +0 -92
- {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/LICENSE +0 -0
- {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/logits_processor.py
CHANGED
@@ -1,7 +1,22 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Logits processing."""
 
 import dataclasses
-from typing import List, Union
+from typing import List, Optional, Union
 
 import torch
 from torch import nn
@@ -10,7 +25,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )
 
-from sglang.srt.
+from sglang.srt.model_executor.model_runner import ForwardMode, InputMetadata
 
 
 @dataclasses.dataclass
@@ -22,23 +37,23 @@ class LogitProcessorOutput:
 
     # The normlaized logprobs of prompts. shape: [#seq]
     normalized_prompt_logprobs: torch.Tensor
-    # The logprobs of
-
+    # The logprobs of input tokens. shape: [#token, vocab_size]
+    input_token_logprobs: torch.Tensor
 
-    # The logprob and id of the top-k tokens in
-
-    # The logprob and id of the top-k tokens in
-
+    # The logprob and id of the top-k tokens in input positions. shape [#seq, #token, k] of Tuple(logprob, token_id)
+    input_top_logprobs: List
+    # The logprob and id of the top-k tokens in output positions. shape [#seq, #token, k] of Tuple(logprob, token_id)
+    output_top_logprobs: List
 
 
 @dataclasses.dataclass
 class LogitsMetadata:
     forward_mode: ForwardMode
-    return_logprob: bool
+    return_logprob: bool = False
 
-    extend_seq_lens: torch.Tensor = None
-    extend_start_loc: torch.Tensor = None
-    top_logprobs_nums: List[int] = None
+    extend_seq_lens: Optional[torch.Tensor] = None
+    extend_start_loc: Optional[torch.Tensor] = None
+    top_logprobs_nums: Optional[List[int]] = None
 
     @classmethod
     def from_input_metadata(cls, input_metadata: InputMetadata):
@@ -58,20 +73,16 @@ class LogitsProcessor(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
 
     def _get_normalized_prompt_logprobs(
-        self,
+        self, input_token_logprobs, logits_metadata: LogitsMetadata
     ):
-        logprobs_cumsum = torch.cumsum(
-            prefill_token_logprobs, dim=0, dtype=torch.float32
-        )
+        logprobs_cumsum = torch.cumsum(input_token_logprobs, dim=0, dtype=torch.float32)
 
         start = logits_metadata.extend_start_loc.clone()
         end = start + logits_metadata.extend_seq_lens - 2
-        start.clamp_(min=0, max=
-        end.clamp_(min=0, max=
+        start.clamp_(min=0, max=input_token_logprobs.shape[0] - 1)
+        end.clamp_(min=0, max=input_token_logprobs.shape[0] - 1)
         sum_logp = (
-            logprobs_cumsum[end]
-            - logprobs_cumsum[start]
-            + prefill_token_logprobs[start]
+            logprobs_cumsum[end] - logprobs_cumsum[start] + input_token_logprobs[start]
         )
         normalized_prompt_logprobs = sum_logp / (
             (logits_metadata.extend_seq_lens - 1).clamp(min=1)
@@ -79,37 +90,51 @@ class LogitsProcessor(nn.Module):
 
         return normalized_prompt_logprobs
 
-
-
+    @staticmethod
+    def get_top_logprobs(all_logprobs, logits_metadata: LogitsMetadata):
         if logits_metadata.forward_mode == ForwardMode.DECODE:
-
-
-
-
-
-
-
-            return None,
+            output_top_logprobs = []
+            max_k = max(logits_metadata.top_logprobs_nums)
+            ret = all_logprobs.topk(max_k, dim=1)
+            values = ret.values.tolist()
+            indices = ret.indices.tolist()
+            for i, k in enumerate(logits_metadata.top_logprobs_nums):
+                output_top_logprobs.append(list(zip(values[i][:k], indices[i][:k])))
+            return None, output_top_logprobs
         else:
-
+            # TODO: vectorize the code below
+            input_top_logprobs, output_top_logprobs = [], []
             pt = 0
             extend_seq_lens_cpu = logits_metadata.extend_seq_lens.tolist()
+
+            max_k = max(logits_metadata.top_logprobs_nums)
+            ret = all_logprobs.topk(max_k, dim=1)
+            values = ret.values.tolist()
+            indices = ret.indices.tolist()
+
            for i, extend_seq_len in enumerate(extend_seq_lens_cpu):
                 if extend_seq_len == 0:
-
-
+                    input_top_logprobs.append([])
+                    output_top_logprobs.append([])
                     continue
                 k = logits_metadata.top_logprobs_nums[i]
-
-
-
-
-
+                input_top_logprobs.append(
+                    [
+                        list(zip(values[pt + j][:k], indices[pt + j][:k]))
+                        for j in range(extend_seq_len - 1)
+                    ]
+                )
+                output_top_logprobs.append(
+                    list(
+                        zip(
+                            values[pt + extend_seq_len - 1][:k],
+                            indices[pt + extend_seq_len - 1][:k],
+                        )
+                    )
                 )
-                decode_top_logprobs.append(list(zip(vs_cpu[-1], ps_cpu[-1])))
                 pt += extend_seq_len
 
-            return
+            return input_top_logprobs, output_top_logprobs
 
     def forward(
         self,
@@ -136,7 +161,7 @@ class LogitsProcessor(nn.Module):
         last_logits = torch.matmul(last_hidden, weight.T)
         if self.tp_size > 1:
             last_logits = tensor_model_parallel_all_gather(last_logits)
-        last_logits = last_logits[:, : self.config.vocab_size]
+        last_logits = last_logits[:, : self.config.vocab_size].float()
 
         if hasattr(self.config, "final_logit_softcapping"):
             last_logits /= self.config.final_logit_softcapping
@@ -149,63 +174,75 @@ class LogitsProcessor(nn.Module):
                 next_token_logits=last_logits,
                 next_token_logprobs=None,
                 normalized_prompt_logprobs=None,
-
-
-
+                input_token_logprobs=None,
+                input_top_logprobs=None,
+                output_top_logprobs=None,
             )
         else:
             # When logprob is requested, compute the logits for all tokens.
             if logits_metadata.forward_mode == ForwardMode.DECODE:
-
-            else:
-                all_logits = torch.matmul(hidden_states, weight.T)
-                if self.tp_size > 1:
-                    all_logits = tensor_model_parallel_all_gather(all_logits)
-                all_logits = all_logits[:, : self.config.vocab_size]
+                last_logprobs = torch.nn.functional.log_softmax(last_logits, dim=-1)
 
-
-
-
-
-            # Get the logprob of top-k tokens
-            return_top_logprob = any(x > 0 for x in logits_metadata.top_logprobs_nums)
-            if return_top_logprob:
-                prefill_top_logprobs, decode_top_logprobs = self._get_top_logprobs(
-                    all_logprobs, logits_metadata
+                # Get the logprob of top-k tokens
+                return_top_logprob = any(
+                    x > 0 for x in logits_metadata.top_logprobs_nums
                 )
-
-
+                if return_top_logprob:
+                    output_top_logprobs = self.get_top_logprobs(
+                        last_logprobs, logits_metadata
+                    )[1]
+                else:
+                    output_top_logprobs = None
 
-            if logits_metadata.forward_mode == ForwardMode.DECODE:
                 return LogitProcessorOutput(
                     next_token_logits=last_logits,
-                    next_token_logprobs=
+                    next_token_logprobs=last_logprobs,
                     normalized_prompt_logprobs=None,
-
-
-
+                    input_token_logprobs=None,
+                    input_top_logprobs=None,
+                    output_top_logprobs=output_top_logprobs,
                 )
             else:
+                all_logits = torch.matmul(hidden_states, weight.T)
+                if self.tp_size > 1:
+                    all_logits = tensor_model_parallel_all_gather(all_logits)
+                all_logits = all_logits[:, : self.config.vocab_size].float()
+
+                all_logprobs = all_logits
+                del all_logits
+                all_logprobs[:] = torch.nn.functional.log_softmax(all_logprobs, dim=-1)
+
+                # Get the logprob of top-k tokens
+                return_top_logprob = any(
+                    x > 0 for x in logits_metadata.top_logprobs_nums
+                )
+                if return_top_logprob:
+                    input_top_logprobs, output_top_logprobs = self.get_top_logprobs(
+                        all_logprobs, logits_metadata
+                    )
+                else:
+                    input_top_logprobs = output_top_logprobs = None
+
                 last_logprobs = all_logprobs[last_index]
 
                 # Compute the logprobs and normalized logprobs for the prefill tokens.
                 # Note that we pad a zero at the end of each sequence for easy computation.
-
+                input_token_logprobs = all_logprobs[
                     torch.arange(all_logprobs.shape[0], device="cuda"),
                     torch.cat([input_ids[1:], torch.tensor([0], device="cuda")]),
                 ]
 
                 normalized_prompt_logprobs = self._get_normalized_prompt_logprobs(
-
+                    input_token_logprobs, logits_metadata
                 )
 
                 return LogitProcessorOutput(
                     next_token_logits=last_logits,
                     next_token_logprobs=last_logprobs,
                     normalized_prompt_logprobs=normalized_prompt_logprobs,
-
-
-
+                    input_token_logprobs=input_token_logprobs,
+                    input_top_logprobs=input_top_logprobs,
+                    output_top_logprobs=output_top_logprobs,
                 )
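The logits-processor rewrite above renames the `prefill_*` fields to `input_*`/`output_*`, upcasts logits to float32, and batches the top-k work into a single `topk` call at the batch-wide maximum k. Below is a minimal standalone sketch of the two recoverable techniques; it is not the sglang API, and the tensor shapes and demo inputs are invented for illustration.

```python
# Sketch of the two logprob computations visible in the new logits_processor.py.
# Assumes all_logprobs is already log-softmaxed; shapes are invented for the demo.
import torch


def get_output_top_logprobs(all_logprobs, top_logprobs_nums):
    """One batched topk at max_k, then per-request truncation to each request's k."""
    max_k = max(top_logprobs_nums)
    ret = all_logprobs.topk(max_k, dim=1)  # a single kernel call for the whole batch
    values, indices = ret.values.tolist(), ret.indices.tolist()
    return [
        list(zip(values[i][:k], indices[i][:k]))
        for i, k in enumerate(top_logprobs_nums)
    ]


def normalized_prompt_logprobs(input_token_logprobs, extend_start_loc, extend_seq_lens):
    """Per-sequence mean prompt logprob via one cumsum and two gathers, no Python loop."""
    cumsum = torch.cumsum(input_token_logprobs, dim=0, dtype=torch.float32)
    start = extend_start_loc.clone()
    end = start + extend_seq_lens - 2
    start.clamp_(min=0, max=input_token_logprobs.shape[0] - 1)
    end.clamp_(min=0, max=input_token_logprobs.shape[0] - 1)
    # Sum of logprobs over [start, end] per sequence, recovered from the cumsum.
    sum_logp = cumsum[end] - cumsum[start] + input_token_logprobs[start]
    return sum_logp / (extend_seq_lens - 1).clamp(min=1)


logprobs = torch.log_softmax(torch.randn(2, 1000), dim=-1)
print(get_output_top_logprobs(logprobs, [2, 5]))

token_logprobs = torch.log(torch.rand(6))  # 6 prompt tokens from 2 sequences
print(normalized_prompt_logprobs(token_logprobs, torch.tensor([0, 3]), torch.tensor([3, 3])))
```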
sglang/srt/layers/quantization/__init__.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # temporarily adapted from vLLM
 # FIXME: in progress of refactoring the model loader
 
sglang/srt/layers/quantization/fp8.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # adapted from https://github.com/vllm-project/vllm/blob/e76466dde2bc9525d55165ceaa600d298c7bf773/vllm/model_executor/layers/quantization/fp8.py
 # FIXME refactor in progress
 from typing import Any, Dict, List, Optional, Union
sglang/srt/layers/radix_attention.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Radix attention."""
 
 import torch
@@ -7,8 +22,11 @@ from torch import nn
 from sglang.global_config import global_config
 from sglang.srt.layers.extend_attention import extend_attention_fwd
 from sglang.srt.layers.token_attention import token_attention_fwd
-from sglang.srt.
-
+from sglang.srt.model_executor.model_runner import (
+    ForwardMode,
+    InputMetadata,
+    global_server_args_dict,
+)
 
 
 class RadixAttention(nn.Module):
@@ -85,7 +103,7 @@ class RadixAttention(nn.Module):
         return o
 
     def extend_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata):
-        if not input_metadata.
+        if not input_metadata.flashinfer_use_ragged:
             self.store_kv_cache(k, v, input_metadata)
 
         o = input_metadata.flashinfer_prefill_wrapper_paged.forward(
sglang/srt/layers/token_attention.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # Adapted from
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_softmax_and_reducev.py
@@ -5,7 +20,7 @@ import torch
 import triton
 import triton.language as tl
 
-from sglang.srt.
+from sglang.srt.managers.schedule_batch import global_server_args_dict
 
 if global_server_args_dict.get("attention_reduce_in_fp32", False):
     REDUCE_TRITON_TYPE = tl.float32
sglang/srt/managers/{controller/manager_multi.py → controller_multi.py}
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """
 A controller that manages multiple data parallel workers.
 Each data parallel worker can manage multiple tensor parallel workers.
@@ -12,7 +27,7 @@ from enum import Enum, auto
 import numpy as np
 import zmq
 
-from sglang.srt.managers.
+from sglang.srt.managers.controller_single import (
     start_controller_process as start_controller_process_single,
 )
 from sglang.srt.managers.io_struct import (
@@ -24,7 +39,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import kill_parent_process
 from sglang.utils import get_exception_traceback
 
-logger = logging.getLogger(
+logger = logging.getLogger(__name__)
 
 
 class LoadBalanceMethod(Enum):
sglang/srt/managers/{controller/manager_single.py → controller_single.py}
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """A controller that manages a group of tensor parallel workers."""
 
 import logging
@@ -7,7 +22,7 @@ from typing import List
 
 import zmq
 
-from sglang.srt.managers.
+from sglang.srt.managers.tp_worker import (
     ModelTpServer,
     broadcast_recv_input,
     launch_tp_servers,
@@ -16,7 +31,7 @@ from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import kill_parent_process
 from sglang.utils import get_exception_traceback
 
-logger = logging.getLogger(
+logger = logging.getLogger(__name__)
 
 
 class ControllerSingle:
sglang/srt/managers/detokenizer_manager.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """DetokenizerManager is a process that detokenizes the token ids."""
 
 import asyncio
@@ -10,8 +25,8 @@ import zmq
 import zmq.asyncio
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.managers.controller.infer_batch import FINISH_MATCHED_STR
 from sglang.srt.managers.io_struct import BatchStrOut, BatchTokenIDOut
+from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.utils import find_printable_text, get_exception_traceback, graceful_registry
 
sglang/srt/managers/io_struct.py
CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """
 The definition of objects transfered between different
 processes (TokenizerManager, DetokenizerManager, Controller).
@@ -7,7 +22,7 @@ import uuid
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
 
-from sglang.srt.managers.
+from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling_params import SamplingParams
 
 
@@ -20,7 +35,7 @@ class GenerateReqInput:
     # The image input. It can be a file name, a url, or base64 encoded string.
     # See also python/sglang/srt/utils.py:load_image.
     image_data: Optional[Union[List[str], str]] = None
-    # The sampling_params.
+    # The sampling_params. See descriptions below.
     sampling_params: Union[List[Dict], Dict] = None
     # The request id.
     rid: Optional[Union[List[str], str]] = None
@@ -30,7 +45,7 @@ class GenerateReqInput:
     logprob_start_len: Optional[Union[List[int], int]] = None
     # The number of top logprobs to return.
     top_logprobs_num: Optional[Union[List[int], int]] = None
-    # Whether to detokenize tokens in logprobs.
+    # Whether to detokenize tokens in text in the returned logprobs.
     return_text_in_logprobs: bool = False
     # Whether to stream output.
     stream: bool = False
@@ -64,8 +79,26 @@ class GenerateReqInput:
             if self.top_logprobs_num is None:
                 self.top_logprobs_num = 0
         else:
-
-
+            parallel_sample_num_list = []
+            if isinstance(self.sampling_params, dict):
+                parallel_sample_num = self.sampling_params.get("n", 1)
+            elif isinstance(self.sampling_params, list):
+                for sp in self.sampling_params:
+                    parallel_sample_num = sp.get("n", 1)
+                    parallel_sample_num_list.append(parallel_sample_num)
+                parallel_sample_num = max(parallel_sample_num_list)
+                all_equal = all(
+                    element == parallel_sample_num
+                    for element in parallel_sample_num_list
+                )
+                if parallel_sample_num > 1 and (not all_equal):
+                    ## TODO cope with the case that the parallel_sample_num is different for different samples
+                    raise ValueError(
+                        "The parallel_sample_num should be the same for all samples in sample params."
+                    )
+            else:
+                parallel_sample_num = 1
+            self.parallel_sample_num = parallel_sample_num
 
         if parallel_sample_num != 1:
             # parallel sampling +1 represents the original prefill stage
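The new `GenerateReqInput` post-init logic above derives a batch-wide parallel sample count from the OpenAI-style `n` in `sampling_params` and rejects batches that mix different values. A distilled sketch of that rule follows; the helper name is hypothetical.

```python
# Hypothetical helper distilling the new "n" validation in GenerateReqInput:
# a single dict is one request; a list is a batch, which must agree on n > 1.
def resolve_parallel_sample_num(sampling_params):
    if isinstance(sampling_params, dict):
        return sampling_params.get("n", 1)
    nums = [sp.get("n", 1) for sp in sampling_params]
    n = max(nums)
    if n > 1 and any(num != n for num in nums):
        raise ValueError(
            "The parallel_sample_num should be the same for all samples in sample params."
        )
    return n


print(resolve_parallel_sample_num({"n": 4}))              # 4
print(resolve_parallel_sample_num([{"n": 2}, {"n": 2}]))  # 2
```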
sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py}
CHANGED
@@ -1,46 +1,61 @@
-"""
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Request policy scheduler"""
 
 import random
 from collections import defaultdict
 
 
-class
+class PolicyScheduler:
     def __init__(
         self,
-
+        policy,
         max_running_seqs,
         max_prefill_num_tokens,
         max_total_num_tokens,
         tree_cache,
     ):
-        if tree_cache.disable and
+        if tree_cache.disable and policy == "lpm":
             # LMP is meaningless when the tree cache is disabled.
-
+            policy = "fcfs"
 
-        self.
+        self.policy = policy
         self.max_running_seqs = max_running_seqs
         self.max_prefill_num_tokens = max_prefill_num_tokens
         self.max_total_num_tokens = max_total_num_tokens
         self.tree_cache = tree_cache
 
-    def get_priority_queue(self,
-        if self.
+    def get_priority_queue(self, waiting_queue):
+        if self.policy == "lpm":
             # longest prefix match
-
-            return
-        elif self.
+            waiting_queue.sort(key=lambda x: -len(x.prefix_indices))
+            return waiting_queue
+        elif self.policy == "fcfs":
             # first come first serve
-            return
-        elif self.
+            return waiting_queue
+        elif self.policy == "lof":
             # longest output first
-
-            return
-        elif self.
-            random.shuffle(
-            return
-        elif self.
+            waiting_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens)
+            return waiting_queue
+        elif self.policy == "random":
+            random.shuffle(waiting_queue)
+            return waiting_queue
+        elif self.policy == "dfs-weight":
             last_node_to_reqs = defaultdict(list)
-            for req in
+            for req in waiting_queue:
                 last_node_to_reqs[req.last_node].append(req)
 
             node_to_weight = defaultdict(int)
@@ -52,10 +67,10 @@ class ScheduleHeuristic:
             self.get_dfs_priority(
                 self.tree_cache.root_node, node_to_weight, last_node_to_reqs, q
            )
-            assert len(q) == len(
+            assert len(q) == len(waiting_queue)
             return q
         else:
-            raise ValueError(f"Unknown
+            raise ValueError(f"Unknown schedule_policy: {self.policy}")
 
     def calc_weight(self, cur_node, node_to_weight):
         for child in cur_node.children.values():