sglang 0.3.5.post2__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. sglang/__init__.py +2 -2
  2. sglang/api.py +2 -2
  3. sglang/bench_latency.py +1 -553
  4. sglang/bench_offline_throughput.py +48 -20
  5. sglang/bench_one_batch.py +472 -0
  6. sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
  7. sglang/bench_serving.py +125 -6
  8. sglang/check_env.py +3 -6
  9. sglang/lang/backend/base_backend.py +1 -1
  10. sglang/lang/backend/runtime_endpoint.py +2 -2
  11. sglang/srt/configs/model_config.py +13 -14
  12. sglang/srt/constrained/__init__.py +13 -14
  13. sglang/srt/constrained/base_grammar_backend.py +13 -15
  14. sglang/srt/constrained/outlines_backend.py +28 -17
  15. sglang/srt/constrained/outlines_jump_forward.py +13 -15
  16. sglang/srt/constrained/xgrammar_backend.py +47 -58
  17. sglang/srt/conversation.py +13 -15
  18. sglang/srt/hf_transformers_utils.py +13 -15
  19. sglang/srt/layers/activation.py +16 -13
  20. sglang/srt/layers/attention/flashinfer_backend.py +106 -54
  21. sglang/srt/layers/attention/triton_backend.py +9 -7
  22. sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
  23. sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
  24. sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
  25. sglang/srt/layers/custom_op_util.py +25 -0
  26. sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
  27. sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +11 -4
  28. sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
  29. sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
  30. sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
  31. sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
  32. sglang/srt/layers/fused_moe_triton/layer.py +633 -0
  33. sglang/srt/layers/layernorm.py +17 -15
  34. sglang/srt/layers/logits_processor.py +23 -25
  35. sglang/srt/layers/quantization/__init__.py +77 -17
  36. sglang/srt/layers/radix_attention.py +13 -15
  37. sglang/srt/layers/rotary_embedding.py +13 -13
  38. sglang/srt/layers/sampler.py +4 -8
  39. sglang/srt/layers/torchao_utils.py +2 -0
  40. sglang/srt/lora/lora.py +13 -14
  41. sglang/srt/lora/lora_config.py +13 -14
  42. sglang/srt/lora/lora_manager.py +22 -24
  43. sglang/srt/managers/data_parallel_controller.py +98 -27
  44. sglang/srt/managers/detokenizer_manager.py +13 -15
  45. sglang/srt/managers/io_struct.py +63 -21
  46. sglang/srt/managers/schedule_batch.py +154 -59
  47. sglang/srt/managers/schedule_policy.py +18 -16
  48. sglang/srt/managers/scheduler.py +278 -109
  49. sglang/srt/managers/session_controller.py +61 -0
  50. sglang/srt/managers/tokenizer_manager.py +63 -18
  51. sglang/srt/managers/tp_worker.py +25 -16
  52. sglang/srt/managers/tp_worker_overlap_thread.py +62 -67
  53. sglang/srt/metrics/collector.py +13 -15
  54. sglang/srt/metrics/func_timer.py +13 -15
  55. sglang/srt/mm_utils.py +13 -14
  56. sglang/srt/model_executor/cuda_graph_runner.py +63 -25
  57. sglang/srt/model_executor/forward_batch_info.py +128 -32
  58. sglang/srt/model_executor/model_runner.py +132 -64
  59. sglang/srt/model_parallel.py +98 -0
  60. sglang/srt/models/chatglm.py +15 -16
  61. sglang/srt/models/commandr.py +15 -16
  62. sglang/srt/models/dbrx.py +15 -16
  63. sglang/srt/models/deepseek.py +15 -15
  64. sglang/srt/models/deepseek_v2.py +162 -59
  65. sglang/srt/models/exaone.py +14 -15
  66. sglang/srt/models/gemma.py +14 -14
  67. sglang/srt/models/gemma2.py +31 -25
  68. sglang/srt/models/gemma2_reward.py +13 -14
  69. sglang/srt/models/gpt_bigcode.py +14 -14
  70. sglang/srt/models/grok.py +15 -15
  71. sglang/srt/models/internlm2.py +13 -15
  72. sglang/srt/models/internlm2_reward.py +13 -14
  73. sglang/srt/models/llama.py +21 -21
  74. sglang/srt/models/llama_classification.py +13 -14
  75. sglang/srt/models/llama_reward.py +13 -14
  76. sglang/srt/models/llava.py +14 -16
  77. sglang/srt/models/llavavid.py +14 -16
  78. sglang/srt/models/minicpm.py +13 -15
  79. sglang/srt/models/minicpm3.py +13 -15
  80. sglang/srt/models/mistral.py +13 -15
  81. sglang/srt/models/mixtral.py +15 -15
  82. sglang/srt/models/mixtral_quant.py +14 -14
  83. sglang/srt/models/olmo.py +22 -20
  84. sglang/srt/models/olmoe.py +23 -20
  85. sglang/srt/models/phi3_small.py +447 -0
  86. sglang/srt/models/qwen.py +14 -14
  87. sglang/srt/models/qwen2.py +22 -19
  88. sglang/srt/models/qwen2_moe.py +17 -18
  89. sglang/srt/models/qwen2_vl.py +13 -6
  90. sglang/srt/models/stablelm.py +18 -16
  91. sglang/srt/models/torch_native_llama.py +107 -93
  92. sglang/srt/models/xverse.py +13 -14
  93. sglang/srt/models/xverse_moe.py +15 -16
  94. sglang/srt/models/yivl.py +13 -15
  95. sglang/srt/openai_api/adapter.py +19 -17
  96. sglang/srt/openai_api/protocol.py +14 -16
  97. sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
  98. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
  99. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
  100. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
  101. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
  102. sglang/srt/sampling/sampling_batch_info.py +61 -57
  103. sglang/srt/sampling/sampling_params.py +14 -16
  104. sglang/srt/server.py +86 -35
  105. sglang/srt/server_args.py +96 -80
  106. sglang/srt/utils.py +266 -68
  107. sglang/test/few_shot_gsm8k.py +8 -4
  108. sglang/test/runners.py +38 -20
  109. sglang/test/srt/sampling/penaltylib/utils.py +23 -21
  110. sglang/test/test_utils.py +31 -20
  111. sglang/version.py +1 -1
  112. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
  113. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +66 -57
  114. sglang-0.3.6.post1.dist-info/RECORD +164 -0
  115. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +1 -1
  116. sglang/srt/layers/fused_moe/__init__.py +0 -1
  117. sglang-0.3.5.post2.dist-info/RECORD +0 -156
  118. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/stablelm.py CHANGED
@@ -1,22 +1,24 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 # Adapted from:
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/stablelm.py#L1
-"""Inference-only StableLM-2 (https://huggingface.co/stabilityai/stablelm-2-1_6b)
-model compatible with HuggingFace weights."""
+"""
+Inference-only StableLM-2 (https://huggingface.co/stabilityai/stablelm-2-1_6b)
+model compatible with HuggingFace weights.
+"""
+
 from typing import Iterable, Optional, Tuple
 
 import torch
sglang/srt/models/torch_native_llama.py CHANGED
@@ -1,21 +1,44 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
-"""Inference-only LLaMA model compatible with HuggingFace weights."""
+"""
+Inference-only LLaMA model compatible with HuggingFace weights.
+
+This model supports tensor parallelism (TP) using the PyTorch tensor parallel package.
+Reference: https://pytorch.org/docs/stable/distributed.tensor.parallel.html
+
+Here is a quick example to enable TP:
+```python
+from sglang.srt.model_parallel import tensor_parallel
+
+device_mesh = torch.distributed.init_device_mesh("cuda", (tp_size,))
+tensor_parallel(model, device_mesh)
+```
+
+An end-to-end example can be found in `python/sglang/bench_one_batch.py`.
+You can run it with the following command:
+```bash
+$ python3 -m sglang.bench_one_batch --correct \
+  --model meta-llama/Meta-Llama-3-8B \
+  --json-model-override-args '{"architectures": ["TorchNativeLlamaForCausalLM"]}' \
+  --tensor-parallel-size 2 \
+  --disable-cuda-graph
+```
+We will enable CUDA Graph support soon.
+"""
 
 import types
 from typing import Any, Dict, Iterable, Optional, Tuple
@@ -24,7 +47,10 @@ import torch
 from torch import nn
 from torch.nn.parameter import Parameter
 from transformers import LlamaConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
@@ -41,35 +67,45 @@ from sglang.srt.layers.vocab_parallel_embedding import (
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
+tp_size = get_tensor_model_parallel_world_size()
+tp_rank = get_tensor_model_parallel_rank()
+
 
 def gate_up_proj_weight_loader(
     self,
     param: Parameter,
     loaded_weight: torch.Tensor,
-    loaded_shard_id: Optional[int] = None,
+    loaded_shard_id: int,
 ):
-    if loaded_shard_id is None:
-        shard_offsets: List[Tuple[int, int, int]] = []
-        for i, output_size in enumerate(self.output_sizes):
-            shard_offsets.append((i, current_shard_offset, output_size))
-            current_shard_offset += output_size
-        for shard_id, shard_offset, shard_size in shard_offsets:
-            loaded_weight_shard = loaded_weight.narrow(
-                output_dim, shard_offset, shard_size
-            )
-            self.weight_loader(param, loaded_weight_shard, shard_id)
-    else:
-        assert loaded_shard_id < len(self.output_sizes)
-        param_data = param.data
-        shard_size = loaded_weight.shape[0]
-        shard_offset = loaded_shard_id * shard_size
-        param_data = param_data.narrow(0, shard_offset, shard_size)
-        assert param_data.shape == loaded_weight.shape
-        param_data.copy_(loaded_weight)
-    return
+    # shard_id: (shard_offset, shard_size)
+    gate_up_offsets = {}
+    current_shard_offset = 0
+    for i, output_size in enumerate(self.output_sizes):
+        # Everything shrinks by tp_size if TP enabled
+        output_size = output_size // tp_size
+        gate_up_offsets[i] = (current_shard_offset, output_size)
+        current_shard_offset += output_size
+    # Re-size the param to the size after TP
+    if current_shard_offset != param.shape[0]:
+        # The clone will free the original, full tensor
+        param.data = param.data.narrow(0, 0, current_shard_offset).clone()
+
+    # Now load gate or up
+    assert loaded_shard_id < len(self.output_sizes)
+    param_data = param.data
+    shard_offset, shard_size = gate_up_offsets[loaded_shard_id]
+    param_data = param_data.narrow(0, shard_offset, shard_size)
+    loaded_weight = loaded_weight.narrow(0, tp_rank * shard_size, shard_size)
+    assert param_data.shape == loaded_weight.shape
+    param_data.copy_(loaded_weight)
 
 
 class LlamaMLP(nn.Module):
+    _tp_plan = {
+        "gate_up_proj": "Colwise_Sharded",
+        "down_proj": "Rowwise",
+    }
+
     def __init__(
         self,
         hidden_size: int,
@@ -104,62 +140,44 @@ class LlamaMLP(nn.Module):
         return x
 
 
-def _get_shard_offset_mapping(self, loaded_shard_id: str):
-    shard_offset_mapping = {
-        "q": 0,
-        "k": self.num_heads * self.head_size,
-        "v": (self.num_heads + self.num_kv_heads) * self.head_size,
-        "total": (self.num_heads + 2 * self.num_kv_heads) * self.head_size,
-    }
-    return shard_offset_mapping.get(loaded_shard_id)
-
-
-def _get_shard_size_mapping(self, loaded_shard_id: str):
-    shard_size_mapping = {
-        "q": self.num_heads * self.head_size,
-        "k": self.num_kv_heads * self.head_size,
-        "v": self.num_kv_heads * self.head_size,
-    }
-    return shard_size_mapping.get(loaded_shard_id)
-
-
 def qkv_proj_weight_loader(
     self,
     param: Parameter,
     loaded_weight: torch.Tensor,
-    loaded_shard_id: Optional[str] = None,
+    loaded_shard_id: str,
 ):
-    if loaded_shard_id is None:
-        shard_offsets = [
-            # (shard_id, shard_offset, shard_size)
-            ("q", 0, self.total_num_heads * self.head_size),
-            (
-                "k",
-                self.total_num_heads * self.head_size,
-                self.total_num_kv_heads * self.head_size,
-            ),
-            (
-                "v",
-                (self.total_num_heads + self.total_num_kv_heads) * self.head_size,
-                self.total_num_kv_heads * self.head_size,
-            ),
-        ]
-        for shard_id, shard_offset, shard_size in shard_offsets:
-            loaded_weight_shard = loaded_weight.narrow(
-                param.output_dim, shard_offset, shard_size
-            )
-            self.weight_loader(param, loaded_weight_shard, shard_id)
-    else:
-        shard_offset = self._get_shard_offset_mapping(loaded_shard_id)
-        shard_size = self._get_shard_size_mapping(loaded_shard_id)
-        param_data = param.data
-        param_data = param_data.narrow(0, shard_offset, shard_size)
-        assert param_data.shape == loaded_weight.shape
-        param_data.copy_(loaded_weight)
-    return
+    num_heads = self.num_heads // tp_size
+    num_kv_heads = self.num_kv_heads // tp_size
+    # shard_id: (shard_offset, shard_size)
+    qkv_offsets = {
+        "q": (0, num_heads * self.head_size),
+        "k": (num_heads * self.head_size, num_kv_heads * self.head_size),
+        "v": (
+            (num_heads + num_kv_heads) * self.head_size,
+            num_kv_heads * self.head_size,
+        ),
+    }
+    total_size = qkv_offsets["v"][0] + qkv_offsets["v"][1]
+    # Re-size the param to the size after TP
+    if total_size != param.shape[0]:
+        # The clone will free the original, full tensor
+        param.data = param.data.narrow(0, 0, total_size).clone()
+
+    # Now load q, k or v
+    shard_offset, shard_size = qkv_offsets[loaded_shard_id]
+    param_data = param.data
+    param_data = param_data.narrow(0, shard_offset, shard_size)
+    loaded_weight = loaded_weight.narrow(0, tp_rank * shard_size, shard_size)
+    assert param_data.shape == loaded_weight.shape
+    param_data.copy_(loaded_weight)
 
 
 class LlamaAttention(nn.Module):
+    _tp_plan = {
+        "qkv_proj": "Colwise_Sharded",
+        "o_proj": "Rowwise",
+    }
+
     def __init__(
         self,
         config: LlamaConfig,
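The rewritten loaders shard the fused weights per rank before copying: shard offsets come from the per-rank head counts, and `loaded_weight` is narrowed at `tp_rank * shard_size` so each rank copies only its slice. A standalone sketch of that offset arithmetic, under assumed Llama-3-8B-like dimensions (32 query heads, 8 KV heads, head size 128, tp_size 2; the numbers are illustrative, not taken from the diff):

```python
# Hypothetical dimensions for illustration.
head_size = 128
num_heads = 32 // 2      # query heads per rank at tp_size=2
num_kv_heads = 8 // 2    # KV heads per rank at tp_size=2

# Same shape as qkv_offsets in qkv_proj_weight_loader above:
# shard_id -> (shard_offset, shard_size), in rows of the fused qkv weight.
qkv_offsets = {
    "q": (0, num_heads * head_size),
    "k": (num_heads * head_size, num_kv_heads * head_size),
    "v": (
        (num_heads + num_kv_heads) * head_size,
        num_kv_heads * head_size,
    ),
}
print(qkv_offsets)  # {'q': (0, 2048), 'k': (2048, 512), 'v': (2560, 512)}
# Per-rank rows: 2048 + 512 + 512 = 3072, i.e. the full (32 + 2*8) * 128 = 6144
# rows halved across the two ranks.
```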
@@ -176,7 +194,6 @@ class LlamaAttention(nn.Module):
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
-        tp_size = get_tensor_model_parallel_world_size()
         self.total_num_heads = num_heads
         assert self.total_num_heads % tp_size == 0
         self.num_heads = self.total_num_heads // tp_size
@@ -205,20 +222,12 @@
             (self.total_num_heads + 2 * self.total_num_kv_heads) * self.head_dim,
             bias=False,
         )
-        self.qkv_proj.total_num_heads = self.total_num_heads
         self.qkv_proj.head_size = self.head_dim
-        self.qkv_proj.total_num_kv_heads = self.total_num_kv_heads
         self.qkv_proj.num_heads = self.total_num_heads
         self.qkv_proj.num_kv_heads = self.total_num_kv_heads
         self.qkv_proj.weight_loader = types.MethodType(
             qkv_proj_weight_loader, self.qkv_proj
         )
-        self.qkv_proj._get_shard_offset_mapping = types.MethodType(
-            _get_shard_offset_mapping, self.qkv_proj
-        )
-        self.qkv_proj._get_shard_size_mapping = types.MethodType(
-            _get_shard_size_mapping, self.qkv_proj
-        )
         self.qkv_proj.weight.weight_loader = self.qkv_proj.weight_loader
         self.qkv_proj.weight.output_dim = 0
         self.o_proj = torch.nn.Linear(
@@ -385,10 +394,15 @@ class TorchNativeLlamaForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.torchao_config = global_server_args_dict["torchao_config"]
+        self.supports_torch_tp = True
         self.model = LlamaModel(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
 
+        # turning off autotune for fp8dq since it doesn't give speedup and
+        # increases compile time significantly
+        torch._inductor.config.max_autotune_gemm_backends = "ATEN"
+
     @torch.no_grad()
     def forward(
         self,
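The `_tp_plan` attributes added above name a sharding style ("Colwise_Sharded" or "Rowwise") for each projection. Below is a minimal sketch of how such a plan could be mapped onto PyTorch's public tensor-parallel API; the `apply_tp_plan` helper and the style-to-class table are illustrative assumptions, not sglang's actual implementation (which lives in the new `sglang/srt/model_parallel.py`):

```python
# Sketch only: assumes an initialized process group and a module whose class
# defines a _tp_plan dict such as {"qkv_proj": "Colwise_Sharded", "o_proj": "Rowwise"}.
import torch.nn as nn
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)

# Assumed mapping from plan strings to PyTorch ParallelStyle classes.
_STYLES = {
    "Colwise_Sharded": ColwiseParallel,
    "Rowwise": RowwiseParallel,
}


def apply_tp_plan(module: nn.Module, mesh: DeviceMesh) -> nn.Module:
    # Translate the string plan into ParallelStyle instances and hand the
    # whole thing to parallelize_module, which shards the named submodules.
    plan = {name: _STYLES[style]() for name, style in module._tp_plan.items()}
    return parallelize_module(module, mesh, plan)
```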
sglang/srt/models/xverse.py CHANGED
@@ -1,17 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/xverse.py#L1
sglang/srt/models/xverse_moe.py CHANGED
@@ -1,19 +1,18 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Inference-only XVERSE MoE model."""
+
 from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
@@ -25,7 +24,6 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -36,6 +34,7 @@ from vllm.model_executor.layers.linear import (
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from sglang.srt.layers.fused_moe_triton import fused_moe
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
sglang/srt/models/yivl.py CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Inference-only Yi-VL model."""
 
 from typing import Iterable, Optional, Tuple
sglang/srt/openai_api/adapter.py CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Conversion between OpenAI APIs and native SRT APIs"""
 
 import asyncio
@@ -989,11 +987,15 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
             output_top_logprobs=ret_item["meta_info"]["output_top_logprobs"],
         )
         token_logprobs = []
-        for token, logprob in zip(logprobs.tokens, logprobs.token_logprobs):
+        for token_idx, (token, logprob) in enumerate(
+            zip(logprobs.tokens, logprobs.token_logprobs)
+        ):
             token_bytes = list(token.encode("utf-8"))
             top_logprobs = []
             if logprobs.top_logprobs:
-                for top_token, top_logprob in logprobs.top_logprobs[0].items():
+                for top_token, top_logprob in logprobs.top_logprobs[
+                    token_idx
+                ].items():
                     top_token_bytes = list(top_token.encode("utf-8"))
                     top_logprobs.append(
                         TopLogprob(
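The adapter hunk above fixes top-logprob alignment: the old loop always read `logprobs.top_logprobs[0]`, attaching the first position's candidates to every output token, while indexing by `token_idx` keeps candidates paired with their own token. A toy illustration with made-up values:

```python
# Hypothetical logprob data for three generated tokens.
tokens = ["Hello", ",", " world"]
top_logprobs = [
    {"Hello": -0.1, "Hi": -2.3},
    {",": -0.2, "!": -1.9},
    {" world": -0.3, " there": -2.1},
]

for token_idx, token in enumerate(tokens):
    candidates = top_logprobs[token_idx]  # was top_logprobs[0] before the fix
    # Holds at every position only when indexing by token_idx; with [0] it
    # fails as soon as token_idx > 0.
    assert token in candidates
```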
sglang/srt/openai_api/protocol.py CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Pydantic models for OpenAI API protocol"""
 
 import time
@@ -236,7 +234,7 @@ ChatCompletionMessageContentPart = Union[
 
 
 class ChatCompletionMessageGenericParam(BaseModel):
-    role: Literal["system", "assistant"]
+    role: Literal["system", "assistant", "tool"]
     content: Union[str, List[ChatCompletionMessageContentTextPart]]
 
 
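The protocol hunk widens the accepted `role` values so that "tool" messages validate instead of being rejected. A minimal self-contained sketch (the two field definitions are copied from the diff; the text-part model here is a simplified stand-in for the one defined elsewhere in the file):

```python
from typing import List, Literal, Union

from pydantic import BaseModel


class ChatCompletionMessageContentTextPart(BaseModel):
    type: Literal["text"]
    text: str


class ChatCompletionMessageGenericParam(BaseModel):
    role: Literal["system", "assistant", "tool"]
    content: Union[str, List[ChatCompletionMessageContentTextPart]]


# Accepted as of 0.3.6.post1; raised a ValidationError in 0.3.5.post2.
msg = ChatCompletionMessageGenericParam(role="tool", content="tool output")
```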