sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. sglang/bench_serving.py +23 -3
  2. sglang/srt/configs/deepseekvl2.py +10 -1
  3. sglang/srt/configs/model_config.py +5 -16
  4. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  5. sglang/srt/distributed/parallel_state.py +32 -5
  6. sglang/srt/entrypoints/http_server.py +7 -1
  7. sglang/srt/entrypoints/verl_engine.py +2 -0
  8. sglang/srt/function_call_parser.py +0 -1
  9. sglang/srt/layers/attention/flashattention_backend.py +218 -79
  10. sglang/srt/layers/dp_attention.py +12 -1
  11. sglang/srt/layers/moe/topk.py +30 -3
  12. sglang/srt/layers/quantization/__init__.py +134 -165
  13. sglang/srt/layers/quantization/awq.py +200 -0
  14. sglang/srt/layers/quantization/fp8_kernel.py +2 -1
  15. sglang/srt/layers/quantization/gptq.py +30 -40
  16. sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
  17. sglang/srt/layers/rotary_embedding.py +12 -0
  18. sglang/srt/lora/backend/base_backend.py +4 -4
  19. sglang/srt/lora/backend/flashinfer_backend.py +12 -9
  20. sglang/srt/lora/backend/triton_backend.py +5 -8
  21. sglang/srt/lora/layers.py +19 -33
  22. sglang/srt/lora/lora_manager.py +20 -7
  23. sglang/srt/lora/mem_pool.py +12 -6
  24. sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
  25. sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
  26. sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
  27. sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
  28. sglang/srt/lora/utils.py +6 -0
  29. sglang/srt/managers/io_struct.py +4 -2
  30. sglang/srt/managers/multimodal_processors/clip.py +63 -0
  31. sglang/srt/managers/schedule_batch.py +1 -0
  32. sglang/srt/managers/scheduler.py +25 -19
  33. sglang/srt/managers/tokenizer_manager.py +0 -1
  34. sglang/srt/managers/tp_worker.py +3 -0
  35. sglang/srt/model_executor/cuda_graph_runner.py +9 -8
  36. sglang/srt/model_executor/model_runner.py +9 -6
  37. sglang/srt/model_loader/loader.py +11 -1
  38. sglang/srt/model_loader/weight_utils.py +6 -3
  39. sglang/srt/models/clip.py +563 -0
  40. sglang/srt/models/deepseek_janus_pro.py +2 -2
  41. sglang/srt/models/deepseek_v2.py +151 -26
  42. sglang/srt/models/gemma3_causal.py +12 -2
  43. sglang/srt/models/gemma3_mm.py +6 -0
  44. sglang/srt/openai_api/adapter.py +88 -87
  45. sglang/srt/openai_api/protocol.py +10 -5
  46. sglang/srt/patch_torch.py +71 -0
  47. sglang/srt/server_args.py +21 -11
  48. sglang/srt/speculative/eagle_worker.py +1 -1
  49. sglang/srt/utils.py +33 -0
  50. sglang/test/runners.py +27 -2
  51. sglang/test/test_utils.py +1 -1
  52. sglang/version.py +1 -1
  53. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +8 -4
  54. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +57 -53
  55. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +0 -0
  56. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/licenses/LICENSE +0 -0
  57. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
sglang/srt/models/deepseek_v2.py
@@ -39,6 +39,8 @@ from sglang.srt.layers.dp_attention import (
     get_attention_dp_size,
     get_attention_tp_rank,
     get_attention_tp_size,
+    tp_all_gather,
+    tp_reduce_scatter,
 )
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -71,7 +73,7 @@ from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.model_loader.weight_utils import default_weight_loader
-from sglang.srt.utils import add_prefix, is_cuda, is_cuda_available, is_hip
+from sglang.srt.utils import add_prefix, is_cuda, is_hip

 _is_hip = is_hip()
 _is_cuda = is_cuda()
@@ -278,7 +280,11 @@ class DeepseekV2MoE(nn.Module):
         topk_weights = torch.empty(
             (0, self.top_k), dtype=torch.float32, device=hidden_states.device
         )
-        if forward_mode is not None and not forward_mode.is_idle():
+        if (
+            forward_mode is not None
+            and not forward_mode.is_idle()
+            and hidden_states.shape[0] > 0
+        ):
             # router_logits: (num_tokens, n_experts)
             router_logits = self.gate(hidden_states)
             if self.n_shared_experts is not None:
@@ -649,6 +655,7 @@ class DeepseekV2AttentionMLA(nn.Module):
         self.flashinfer_mla_disable_ragged = global_server_args_dict[
             "flashinfer_mla_disable_ragged"
         ]
+        self.attention_backend = global_server_args_dict["attention_backend"]
         self.rocm_fused_decode_mla = os.getenv("SGLANG_ROCM_FUSED_DECODE_MLA") == "1"

     def no_absorb(self, forward_batch: ForwardBatch) -> bool:
@@ -661,6 +668,9 @@
                 and not forward_batch.forward_mode.is_draft_extend()
                 and sum(forward_batch.extend_prefix_lens_cpu) == 0
             )
+        elif self.attention_backend == "fa3":
+            # Flash Attention: Keep absorbing for all extend/decode
+            return False
         else:
             # Triton: Use normal computation for prefill and use weight absorption for extend/decode
             return (
@@ -969,6 +979,14 @@ class DeepseekV2DecoderLayer(nn.Module):
         is_nextn: bool = False,
         prefix: str = "",
     ) -> None:
+
+        def is_sparse_layer(l: int):
+            return (
+                config.n_routed_experts is not None
+                and l >= config.first_k_dense_replace
+                and l % config.moe_layer_freq == 0
+            )
+
         super().__init__()
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
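
The `is_sparse_layer` helper introduced above selects the routed-expert (MoE) layers: any layer at or beyond `first_k_dense_replace` whose index is a multiple of `moe_layer_freq`. A minimal standalone sketch of that rule follows; the config numbers are made up for illustration, not taken from a shipped DeepSeek checkpoint.

# Illustrative only: config values below are invented for the example.
def is_sparse_layer(layer_id, n_routed_experts, first_k_dense_replace, moe_layer_freq):
    return (
        n_routed_experts is not None
        and layer_id >= first_k_dense_replace
        and layer_id % moe_layer_freq == 0
    )

sparse_layers = [
    l
    for l in range(8)
    if is_sparse_layer(l, n_routed_experts=64, first_k_dense_replace=1, moe_layer_freq=1)
]
print(sparse_layers)  # [1, 2, 3, 4, 5, 6, 7]: every layer after the first dense layer
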
@@ -977,6 +995,8 @@
         self.enable_dp_attention = global_server_args_dict["enable_dp_attention"]
         self.layer_id = layer_id
         self.dp_size = get_attention_dp_size()
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()

         if not global_server_args_dict["disable_mla"]:
             self.self_attn = DeepseekV2AttentionMLA(
@@ -1019,16 +1039,13 @@
                 prefix=add_prefix("self_attn", prefix),
             )

-        if is_nextn or (
-            config.n_routed_experts is not None
-            and layer_id >= config.first_k_dense_replace
-            and layer_id % config.moe_layer_freq == 0
-        ):
+        if is_nextn or is_sparse_layer(layer_id):
             self.mlp = DeepseekV2MoE(
                 config=config,
                 quant_config=quant_config,
                 prefix=add_prefix("mlp", prefix),
             )
+            self.is_sparse = True
         else:
             self.mlp = DeepseekV2MLP(
                 hidden_size=config.hidden_size,
@@ -1037,6 +1054,14 @@
                 quant_config=quant_config,
                 prefix=add_prefix("mlp", prefix),
             )
+            self.is_sparse = False
+
+        self.input_is_scattered = (
+            is_sparse_layer(layer_id - 1)
+            and global_server_args_dict["enable_deepep_moe"]
+        )
+        self.is_last_layer = self.layer_id == config.num_hidden_layers - 1
+
         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(
             config.hidden_size, eps=config.rms_norm_eps
@@ -1049,6 +1074,23 @@
         forward_batch: ForwardBatch,
         residual: Optional[torch.Tensor],
     ) -> torch.Tensor:
+        if global_server_args_dict["enable_deepep_moe"] and self.is_sparse:
+            return self.forward_deepep(
+                positions, hidden_states, forward_batch, residual
+            )
+        else:
+            return self.forward_normal(
+                positions, hidden_states, forward_batch, residual
+            )
+
+    def forward_normal(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+
         if hidden_states.shape[0] == 0:
             residual = hidden_states
         else:
@@ -1065,29 +1107,35 @@
             forward_batch=forward_batch,
         )

+        if self.attn_tp_size != 1 and self.input_is_scattered:
+            hidden_states, local_hidden_states = (
+                forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
+                hidden_states,
+            )
+            tp_all_gather(
+                list(hidden_states.tensor_split(self.attn_tp_size)), local_hidden_states
+            )
+            residual, local_residual = (
+                forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
+                residual,
+            )
+            tp_all_gather(
+                list(residual.tensor_split(self.attn_tp_size)), local_residual
+            )
+
         # Gather
         if get_tensor_model_parallel_world_size() > 1:
             # all gather and all reduce
             if self.dp_size != 1:
-                if global_server_args_dict["enable_deepep_moe"] and isinstance(
-                    self.mlp, DeepseekV2MoE
-                ):
-                    if hidden_states.shape[0] != 0:
-                        hidden_states, residual = self.post_attention_layernorm(
-                            hidden_states, residual
-                        )
-                    hidden_states = self.mlp(hidden_states, forward_batch.forward_mode)
-                    return hidden_states, residual
-                else:
-                    if get_attention_tp_rank() == 0:
-                        hidden_states += residual
-                    hidden_states, local_hidden_states = (
-                        forward_batch.gathered_buffer,
-                        hidden_states,
-                    )
-                    dp_gather_partial(hidden_states, local_hidden_states, forward_batch)
-                    dp_scatter(residual, hidden_states, forward_batch)
-                    hidden_states = self.post_attention_layernorm(hidden_states)
+                if self.attn_tp_rank == 0:
+                    hidden_states += residual
+                hidden_states, local_hidden_states = (
+                    forward_batch.gathered_buffer,
+                    hidden_states,
+                )
+                dp_gather_partial(hidden_states, local_hidden_states, forward_batch)
+                dp_scatter(residual, hidden_states, forward_batch)
+                hidden_states = self.post_attention_layernorm(hidden_states)
             else:
                 hidden_states = tensor_model_parallel_all_reduce(hidden_states)
                 hidden_states, residual = self.post_attention_layernorm(
@@ -1101,6 +1149,7 @@
         # Fully Connected
         hidden_states = self.mlp(hidden_states)

+        # TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
         # Scatter
         if self.dp_size != 1:
             # important: forward batch.gathered_buffer is used both after scatter and after gather.
@@ -1113,6 +1162,82 @@

         return hidden_states, residual

+    def forward_deepep(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+
+        if hidden_states.shape[0] == 0:
+            residual = hidden_states
+        else:
+            if residual is None:
+                residual = hidden_states
+                hidden_states = self.input_layernorm(hidden_states)
+            else:
+                hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        if self.attn_tp_size != 1 and self.input_is_scattered:
+            hidden_states, local_hidden_states = (
+                forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
+                hidden_states,
+            )
+            tp_all_gather(
+                list(hidden_states.tensor_split(self.attn_tp_size)), local_hidden_states
+            )
+
+        # Self Attention
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        if self.attn_tp_size != 1:
+            if self.input_is_scattered:
+                tensor_list = list(hidden_states.tensor_split(self.attn_tp_size))
+                hidden_states = tensor_list[self.attn_tp_rank]
+                tp_reduce_scatter(hidden_states, tensor_list)
+                if hidden_states.shape[0] != 0:
+                    hidden_states, residual = self.post_attention_layernorm(
+                        hidden_states, residual
+                    )
+            else:
+                if self.attn_tp_rank == 0:
+                    hidden_states += residual
+                tensor_list = list(hidden_states.tensor_split(self.attn_tp_size))
+                hidden_states = tensor_list[self.attn_tp_rank]
+                tp_reduce_scatter(hidden_states, tensor_list)
+                residual = hidden_states
+                if hidden_states.shape[0] != 0:
+                    hidden_states = self.post_attention_layernorm(hidden_states)
+        else:
+            if hidden_states.shape[0] != 0:
+                hidden_states, residual = self.post_attention_layernorm(
+                    hidden_states, residual
+                )
+        hidden_states = self.mlp(hidden_states, forward_batch.forward_mode)
+
+        if self.is_last_layer and self.attn_tp_size != 1:
+            hidden_states, local_hidden_states = (
+                forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
+                hidden_states,
+            )
+            tp_all_gather(
+                list(hidden_states.tensor_split(self.attn_tp_size)), local_hidden_states
+            )
+            residual, local_residual = (
+                forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
+                residual,
+            )
+            tp_all_gather(
+                list(residual.tensor_split(self.attn_tp_size)), local_residual
+            )
+
+        return hidden_states, residual
+

 class DeepseekV2Model(nn.Module):

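The new `forward_deepep` path repeatedly calls `tensor_split(self.attn_tp_size)` on a gathered buffer and either all-gathers into those shards or keeps only the local rank's shard before `tp_reduce_scatter`. Below is a single-process sketch of that shard bookkeeping using plain `torch` and no collectives; `tp_size` and `tp_rank` are illustrative stand-ins for the attention tensor-parallel group, not sglang APIs.

import torch

# Hypothetical stand-ins for the attention tensor-parallel group.
tp_size, tp_rank = 4, 1

# "Gathered" buffer holding the tokens of every rank, plus its split view.
gathered = torch.arange(8 * 3, dtype=torch.float32).reshape(8, 3)
shards = list(gathered.tensor_split(tp_size))  # 4 views of 2 tokens each

# Scatter-style bookkeeping: each rank keeps only its own shard, which is
# what precedes tp_reduce_scatter in forward_deepep.
local = shards[tp_rank]
print(local.shape)  # torch.Size([2, 3])

# Gather-style bookkeeping: because tensor_split returns views, writing a
# local shard back through the list fills the shared buffer in place,
# mirroring (within one process) what an all-gather into the buffer does.
shards[tp_rank].copy_(local * 10)
print(gathered[2:4])  # rows 2-3 updated in place
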
sglang/srt/models/gemma3_causal.py
@@ -47,6 +47,12 @@ from sglang.srt.model_loader.weight_utils import (
 from sglang.srt.utils import add_prefix, make_layers


+# Aligned with HF's implementation, using sliding window inclusive with the last token
+# SGLang assumes exclusive
+def get_attention_sliding_window_size(config):
+    return config.sliding_window - 1
+
+
 # Adapted from:
 # https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/gemma3.py
 def extract_layer_index(prefix: str) -> int:
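
Per the comment above, HF's Gemma 3 `sliding_window` counts the current token, while SGLang's backends expect the window to cover only the preceding tokens, hence the `- 1`. A small sketch of the off-by-one, with an illustrative window value:

# Illustrative only: window=4 under the HF (inclusive) convention equals
# window=3 under the exclusive convention SGLang's backends assume.
hf_sliding_window = 4
sglang_sliding_window = hf_sliding_window - 1  # what get_attention_sliding_window_size returns

def visible_positions(query_pos: int, window_exclusive: int) -> list[int]:
    # Causal positions the query may attend to when only `window_exclusive`
    # previous tokens stay visible.
    return [p for p in range(query_pos + 1) if query_pos - p <= window_exclusive]

print(visible_positions(10, sglang_sliding_window))  # [7, 8, 9, 10] -> 4 tokens, as HF intends
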
@@ -170,7 +176,7 @@ class Gemma3Attention(nn.Module):
             self.rope_scaling = {"rope_type": "default"}
             # FIXME(mick): idk why vllm does this
             # self.sliding_window = config.interleaved_sliding_window
-            self.sliding_window = config.sliding_window
+            self.sliding_window = get_attention_sliding_window_size(config)
         else:
             # Global attention. Use the values in config.json.
             self.rope_theta = config.rope_theta
@@ -184,6 +190,8 @@
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
             logit_cap=getattr(self.config, "attn_logit_softcapping", None),
+            # Module must also define `get_attention_sliding_window_size` to correctly initialize
+            # attention backend in `ForwardBatch`.
             sliding_window_size=self.sliding_window,
             prefix=add_prefix("attn", prefix),
         )
@@ -609,6 +617,9 @@ class Gemma3ForCausalLM(PreTrainedModel):
     def get_input_embeddings(self) -> nn.Embedding:
         return self.model.embed_tokens

+    def get_attention_sliding_window_size(self):
+        return get_attention_sliding_window_size(self.config)
+
     def dtype(self) -> torch.dtype:
         return next(self.parameters()).dtype

@@ -621,7 +632,6 @@ class Gemma3ForCausalLM(PreTrainedModel):
         input_embeds: torch.Tensor = None,
         **kwargs,
     ) -> LogitsProcessor:
-
         hidden_states = self.model(
             input_ids, positions, forward_batch, input_embeds, **kwargs
         )
sglang/srt/models/gemma3_mm.py
@@ -268,6 +268,12 @@ class Gemma3ForConditionalGeneration(PreTrainedModel):
     def get_input_embeddings(self) -> nn.Embedding:
         return self.language_model.get_input_embeddings()

+    def get_attention_sliding_window_size(self):
+        """
+        This value is used to initialize attention backends in `ForwardBatch`.
+        """
+        return self.language_model.get_attention_sliding_window_size()
+
     def get_image_feature(self, image_input: MultimodalInputs):
         """
         Projects the last hidden state from the vision model into language model space.
sglang/srt/openai_api/adapter.py
@@ -20,7 +20,7 @@ import os
 import time
 import uuid
 from http import HTTPStatus
-from typing import Any, Dict, List, Set
+from typing import Dict, List

 from fastapi import HTTPException, Request, UploadFile
 from fastapi.responses import ORJSONResponse, StreamingResponse
@@ -645,7 +645,7 @@ def v1_generate_response(
                 "index": 0,
                 "text": text,
                 "logprobs": logprobs,
-                "finish_reason": (finish_reason["type"] if finish_reason else ""),
+                "finish_reason": finish_reason["type"] if finish_reason else None,
                 "matched_stop": (
                     finish_reason["matched"]
                     if finish_reason and "matched" in finish_reason
@@ -657,7 +657,7 @@
             index=idx,
             text=text,
             logprobs=logprobs,
-            finish_reason=(finish_reason["type"] if finish_reason else ""),
+            finish_reason=finish_reason["type"] if finish_reason else None,
             matched_stop=(
                 finish_reason["matched"]
                 if finish_reason and "matched" in finish_reason
@@ -805,7 +805,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
                         index=index,
                         text=delta,
                         logprobs=logprobs,
-                        finish_reason=(finish_reason["type"] if finish_reason else ""),
+                        finish_reason=finish_reason["type"] if finish_reason else None,
                         matched_stop=(
                             finish_reason["matched"]
                             if finish_reason and "matched" in finish_reason
@@ -1119,7 +1119,9 @@ def v1_chat_generate_response(
         if logprobs:
             logprobs = to_openai_style_logprobs(
                 output_token_logprobs=ret_item["meta_info"]["output_token_logprobs"],
-                output_top_logprobs=ret_item["meta_info"]["output_top_logprobs"],
+                output_top_logprobs=ret_item["meta_info"].get(
+                    "output_top_logprobs", None
+                ),
             )
             token_logprobs = []
             for token_idx, (token, logprob) in enumerate(
@@ -1216,7 +1218,7 @@
                 "reasoning_content": reasoning_text if reasoning_text else None,
             },
             "logprobs": choice_logprobs.model_dump() if choice_logprobs else None,
-            "finish_reason": (finish_reason["type"] if finish_reason else ""),
+            "finish_reason": finish_reason["type"] if finish_reason else None,
             "matched_stop": (
                 finish_reason["matched"]
                 if finish_reason and "matched" in finish_reason
@@ -1233,7 +1235,7 @@
                 reasoning_content=reasoning_text if reasoning_text else None,
             ),
             logprobs=choice_logprobs,
-            finish_reason=(finish_reason["type"] if finish_reason else ""),
+            finish_reason=finish_reason["type"] if finish_reason else None,
             matched_stop=(
                 finish_reason["matched"]
                 if finish_reason and "matched" in finish_reason
@@ -1329,9 +1331,9 @@
                            output_token_logprobs=content["meta_info"][
                                "output_token_logprobs"
                            ][n_prev_token:],
-                            output_top_logprobs=content["meta_info"][
-                                "output_top_logprobs"
-                            ][n_prev_token:],
+                            output_top_logprobs=content["meta_info"].get(
+                                "output_top_logprobs", []
+                            )[n_prev_token:],
                        )

                        n_prev_token = len(
@@ -1377,23 +1379,11 @@
                    if is_first:
                        # First chunk with role
                        is_first = False
-                        if (
-                            tokenizer_manager.server_args.reasoning_parser
-                            and request.separate_reasoning
-                        ):
-                            delta = DeltaMessage(
-                                role="assistant", reasoning_content=None
-                            )
-                        else:
-                            delta = DeltaMessage(role="assistant", content=None)
+                        delta = DeltaMessage(role="assistant")
                        choice_data = ChatCompletionResponseStreamChoice(
                            index=index,
                            delta=delta,
-                            finish_reason=(
-                                None
-                                if finish_reason_type and len(finish_reason_type) == 0
-                                else finish_reason_type
-                            ),
+                            finish_reason=finish_reason_type,
                            matched_stop=(
                                finish_reason["matched"]
                                if finish_reason and "matched" in finish_reason
@@ -1434,12 +1424,7 @@
                                        reasoning_text if reasoning_text else None
                                    )
                                ),
-                                finish_reason=(
-                                    None
-                                    if finish_reason_type
-                                    and len(finish_reason_type) == 0
-                                    else finish_reason_type
-                                ),
+                                finish_reason=finish_reason_type,
                            )
                            chunk = ChatCompletionStreamResponse(
                                id=content["meta_info"]["id"],
@@ -1471,12 +1456,7 @@
                                delta=DeltaMessage(
                                    content=normal_text if normal_text else None
                                ),
-                                finish_reason=(
-                                    None
-                                    if finish_reason_type
-                                    and len(finish_reason_type) == 0
-                                    else finish_reason_type
-                                ),
+                                finish_reason=finish_reason_type,
                            )
                            chunk = ChatCompletionStreamResponse(
                                id=content["meta_info"]["id"],
@@ -1490,11 +1470,7 @@
                            for call_item in calls:
                                # transform call_item -> FunctionResponse + ToolCall

-                                if (
-                                    content["meta_info"]["finish_reason"]
-                                    and content["meta_info"]["finish_reason"]["type"]
-                                    == "stop"
-                                ):
+                                if finish_reason_type == "stop":
                                    latest_delta_len = 0
                                    if isinstance(call_item.parameters, str):
                                        latest_delta_len = len(call_item.parameters)
@@ -1515,6 +1491,8 @@
                                    )
                                    call_item.parameters = remaining_call

+                                    finish_reason_type = "tool_calls"
+
                                tool_call = ToolCall(
                                    id=str(call_item.tool_index),
                                    function=FunctionResponse(
@@ -1524,10 +1502,13 @@
                                )
                                choice_data = ChatCompletionResponseStreamChoice(
                                    index=index,
-                                    delta=DeltaMessage(
-                                        role="assistant", tool_calls=[tool_call]
-                                    ),
-                                    finish_reason="tool_call",
+                                    delta=DeltaMessage(tool_calls=[tool_call]),
+                                    finish_reason=(
+                                        None
+                                        if request.stream_options
+                                        and request.stream_options.include_usage
+                                        else finish_reason_type
+                                    ),  # additional chunk will be return
                                )
                                chunk = ChatCompletionStreamResponse(
                                    id=content["meta_info"]["id"],
@@ -1542,30 +1523,44 @@

                        else:
                            # No tool calls => just treat this as normal text
-                            choice_data = ChatCompletionResponseStreamChoice(
-                                index=index,
-                                delta=DeltaMessage(content=delta if delta else None),
-                                finish_reason=(
-                                    None
-                                    if finish_reason_type and len(finish_reason_type) == 0
-                                    else finish_reason_type
-                                ),
-                                matched_stop=(
-                                    finish_reason["matched"]
-                                    if finish_reason and "matched" in finish_reason
-                                    else None
-                                ),
-                                logprobs=choice_logprobs,
-                            )
-                            chunk = ChatCompletionStreamResponse(
-                                id=content["meta_info"]["id"],
-                                created=created,
-                                choices=[choice_data],
-                                model=request.model,
-                            )
-                            yield f"data: {chunk.model_dump_json()}\n\n"
-                            stream_buffers[index] = new_stream_buffer
-                            is_firsts[index] = is_first
+                            if delta or not (
+                                request.stream_options
+                                and request.stream_options.include_usage
+                            ):
+                                choice_data = ChatCompletionResponseStreamChoice(
+                                    index=index,
+                                    delta=DeltaMessage(content=delta if delta else None),
+                                    finish_reason=(
+                                        None
+                                        if request.stream_options
+                                        and request.stream_options.include_usage
+                                        else finish_reason_type
+                                    ),
+                                    matched_stop=(
+                                        finish_reason["matched"]
+                                        if finish_reason and "matched" in finish_reason
+                                        else None
+                                    ),
+                                    logprobs=choice_logprobs,
+                                )
+                                chunk = ChatCompletionStreamResponse(
+                                    id=content["meta_info"]["id"],
+                                    created=created,
+                                    choices=[choice_data],
+                                    model=request.model,
+                                )
+                                yield f"data: {chunk.model_dump_json()}\n\n"
+                                stream_buffers[index] = new_stream_buffer
+                                is_firsts[index] = is_first
+                            if finish_reason_type == "stop" and request.tool_choice != "none":
+                                parser = FunctionCallParser(
+                                    tools=request.tools,
+                                    tool_call_parser=tokenizer_manager.server_args.tool_call_parser,
+                                )
+                                if parser.has_tool_call(new_stream_buffer):
+                                    # if the stream ends with empty string after tool calls
+                                    finish_reason_type = "tool_calls"
+
                if request.stream_options and request.stream_options.include_usage:
                    total_prompt_tokens = sum(
                        tokens
@@ -1590,17 +1585,22 @@
                        prompt_tokens_details=prompt_tokens_details,
                    )

-                    final_usage_chunk = ChatCompletionStreamResponse(
-                        id=content["meta_info"]["id"],
-                        created=created,
-                        choices=[],
-                        model=request.model,
-                        usage=usage,
-                    )
-                    final_usage_data = final_usage_chunk.model_dump_json(
-                        exclude_none=True
-                    )
-                    yield f"data: {final_usage_data}\n\n"
+                else:
+                    usage = None
+                final_usage_chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    created=created,
+                    choices=[
+                        ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(),
+                            finish_reason=finish_reason_type,
+                        )
+                    ],
+                    model=request.model,
+                    usage=usage,
+                )
+                yield f"data: {final_usage_chunk.model_dump_json()}\n\n"
            except ValueError as e:
                error = create_streaming_error_response(str(e))
                yield f"data: {error}\n\n"
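
With these adapter changes, a streaming client sees `finish_reason: null` on intermediate chunks, `"tool_calls"` when the turn ended in a tool call, and a final chunk that always carries the finish reason, with `usage` attached only when `stream_options.include_usage` is set. Below is a rough client-side sketch, assuming the usual `data:`/`[DONE]` SSE framing of OpenAI-compatible servers; the endpoint URL and model name are placeholders.

import json
import requests

# Placeholder endpoint and model name for a locally running OpenAI-compatible server.
resp = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "default",
        "messages": [{"role": "user", "content": "hi"}],
        "stream": True,
        "stream_options": {"include_usage": True},
    },
    stream=True,
)

for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)
    for choice in chunk.get("choices", []):
        # Intermediate chunks carry finish_reason = null; the final choice chunk
        # reports e.g. "stop", "length", or "tool_calls".
        if choice.get("finish_reason"):
            print("finished:", choice["finish_reason"])
    if chunk.get("usage"):  # only present when include_usage was requested
        print("usage:", chunk["usage"])
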
@@ -1653,18 +1653,19 @@ def v1_embedding_request(all_requests, tokenizer_manager):
        elif isinstance(prompt, list) and isinstance(
            prompt[0], MultimodalEmbeddingInput
        ):
-            assert (
-                chat_template_name is not None
-            ), "chat_template_name is required for multimodal inputs"
            texts = []
            images = []
            for item in prompt:
-                texts.append(item.text if item.text is not None else None)
+                # TODO simply use padding for text, we should use a better way to handle this
+                texts.append(item.text if item.text is not None else "padding")
                images.append(item.image if item.image is not None else None)
-            convs = generate_embedding_convs(texts, images, chat_template_name)
            generate_prompts = []
-            for conv in convs:
-                generate_prompts.append(conv.get_prompt())
+            if chat_template_name is not None:
+                convs = generate_embedding_convs(texts, images, chat_template_name)
+                for conv in convs:
+                    generate_prompts.append(conv.get_prompt())
+            else:
+                generate_prompts = texts
            if len(generate_prompts) == 1:
                prompt_kwargs = {"text": generate_prompts[0], "image_data": images[0]}
            else: