sglang 0.4.7__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff shows the changes between two publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (99)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_serving.py +1 -1
  4. sglang/lang/interpreter.py +40 -1
  5. sglang/lang/ir.py +27 -0
  6. sglang/math_utils.py +8 -0
  7. sglang/srt/configs/model_config.py +6 -0
  8. sglang/srt/conversation.py +6 -0
  9. sglang/srt/disaggregation/base/__init__.py +1 -1
  10. sglang/srt/disaggregation/base/conn.py +25 -11
  11. sglang/srt/disaggregation/common/__init__.py +5 -1
  12. sglang/srt/disaggregation/common/utils.py +42 -0
  13. sglang/srt/disaggregation/decode.py +196 -51
  14. sglang/srt/disaggregation/fake/__init__.py +1 -1
  15. sglang/srt/disaggregation/fake/conn.py +15 -9
  16. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  17. sglang/srt/disaggregation/mooncake/conn.py +18 -13
  18. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  19. sglang/srt/disaggregation/nixl/conn.py +17 -12
  20. sglang/srt/disaggregation/prefill.py +128 -43
  21. sglang/srt/disaggregation/utils.py +127 -123
  22. sglang/srt/entrypoints/engine.py +15 -1
  23. sglang/srt/entrypoints/http_server.py +13 -2
  24. sglang/srt/eplb_simulator/__init__.py +1 -0
  25. sglang/srt/eplb_simulator/reader.py +51 -0
  26. sglang/srt/layers/activation.py +19 -0
  27. sglang/srt/layers/attention/aiter_backend.py +15 -2
  28. sglang/srt/layers/attention/cutlass_mla_backend.py +38 -15
  29. sglang/srt/layers/attention/flashattention_backend.py +53 -64
  30. sglang/srt/layers/attention/flashinfer_backend.py +1 -2
  31. sglang/srt/layers/attention/flashinfer_mla_backend.py +22 -24
  32. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  33. sglang/srt/layers/attention/triton_backend.py +119 -119
  34. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  35. sglang/srt/layers/attention/vision.py +51 -24
  36. sglang/srt/layers/communicator.py +23 -5
  37. sglang/srt/layers/linear.py +0 -4
  38. sglang/srt/layers/logits_processor.py +0 -12
  39. sglang/srt/layers/moe/ep_moe/kernels.py +6 -5
  40. sglang/srt/layers/moe/ep_moe/layer.py +42 -32
  41. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
  42. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -4
  43. sglang/srt/layers/moe/topk.py +16 -8
  44. sglang/srt/layers/pooler.py +56 -0
  45. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  46. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
  47. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  48. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  49. sglang/srt/layers/quantization/fp8_kernel.py +44 -15
  50. sglang/srt/layers/quantization/fp8_utils.py +87 -22
  51. sglang/srt/layers/radix_attention.py +2 -3
  52. sglang/srt/lora/lora_manager.py +79 -34
  53. sglang/srt/lora/mem_pool.py +4 -5
  54. sglang/srt/managers/cache_controller.py +2 -1
  55. sglang/srt/managers/io_struct.py +28 -4
  56. sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
  57. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  58. sglang/srt/managers/schedule_batch.py +39 -6
  59. sglang/srt/managers/scheduler.py +73 -17
  60. sglang/srt/managers/tokenizer_manager.py +29 -2
  61. sglang/srt/mem_cache/chunk_cache.py +1 -0
  62. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  63. sglang/srt/mem_cache/memory_pool.py +111 -407
  64. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  65. sglang/srt/mem_cache/radix_cache.py +36 -12
  66. sglang/srt/model_executor/cuda_graph_runner.py +122 -55
  67. sglang/srt/model_executor/forward_batch_info.py +14 -5
  68. sglang/srt/model_executor/model_runner.py +6 -6
  69. sglang/srt/model_loader/loader.py +8 -1
  70. sglang/srt/models/bert.py +113 -13
  71. sglang/srt/models/deepseek_v2.py +113 -155
  72. sglang/srt/models/internvl.py +46 -102
  73. sglang/srt/models/roberta.py +117 -9
  74. sglang/srt/models/vila.py +305 -0
  75. sglang/srt/openai_api/adapter.py +162 -4
  76. sglang/srt/openai_api/protocol.py +37 -1
  77. sglang/srt/sampling/sampling_batch_info.py +24 -0
  78. sglang/srt/sampling/sampling_params.py +2 -0
  79. sglang/srt/server_args.py +318 -233
  80. sglang/srt/speculative/build_eagle_tree.py +1 -1
  81. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -3
  82. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +5 -2
  83. sglang/srt/speculative/eagle_utils.py +389 -109
  84. sglang/srt/speculative/eagle_worker.py +134 -43
  85. sglang/srt/two_batch_overlap.py +4 -2
  86. sglang/srt/utils.py +58 -0
  87. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  88. sglang/test/runners.py +38 -3
  89. sglang/test/test_block_fp8.py +1 -0
  90. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  91. sglang/test/test_block_fp8_ep.py +1 -0
  92. sglang/test/test_utils.py +3 -1
  93. sglang/utils.py +9 -0
  94. sglang/version.py +1 -1
  95. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +5 -5
  96. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +99 -88
  97. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.7.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/vila.py (new file)
@@ -0,0 +1,305 @@
+import logging
+from typing import Any, Dict, Iterable, List, Optional, Tuple, cast
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_outputs import BaseModelOutputWithPooling
+from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
+from transformers.models.siglip import SiglipVisionConfig, SiglipVisionModel
+
+import sglang.srt.managers.mm_utils as mm_utils
+import sglang.srt.model_loader.weight_utils as weight_utils
+import sglang.srt.utils as utils
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import MultiModalityDataPaddingPatternMultimodalTokens
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+
+logger = logging.getLogger(__name__)
+
+
+##### BEGIN COPY configuration.py #####
+
+
+class VILAConfig(PretrainedConfig):
+    # Class attributes.
+    model_type: str = "vila"
+    sub_configs: Dict[str, PretrainedConfig] = {
+        "text_config": Qwen2Config(),
+        "vision_config": SiglipVisionConfig(),
+    }
+    _auto_class: Optional[str] = "AutoConfig"
+
+    # Configuration for sub-modules.
+    text_config: Qwen2Config = Qwen2Config()
+    vision_config: SiglipVisionConfig = SiglipVisionConfig()
+
+    # Model configuration.
+    hidden_size: int
+    image_token_id: int
+    mm_hidden_size: int
+    mm_projector_type: str
+    mm_vision_select_feature: str
+    mm_vision_select_layer: int
+    video_token_id: int
+
+    def __init__(
+        self,
+        text_config: Optional[Dict[str, Any]] = None,
+        vision_config: Optional[Dict[str, Any]] = None,
+        *,
+        hidden_size: int = 1536,
+        image_token_id: int = 151649,
+        mm_hidden_size: int = 1152,
+        mm_projector_type: str = "mlp_downsample_3x3_fix",
+        mm_vision_select_feature: str = "cls_patch",
+        mm_vision_select_layer: int = -2,
+        video_token_id: int = 151650,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.text_config = Qwen2Config(**text_config) if text_config else Qwen2Config()
+        self.vision_config = (
+            SiglipVisionConfig(**vision_config)
+            if vision_config
+            else SiglipVisionConfig()
+        )
+
+        self.hidden_size = hidden_size
+        self.image_token_id = image_token_id
+        self.mm_hidden_size = mm_hidden_size
+        self.mm_projector_type = mm_projector_type
+        self.mm_vision_select_feature = mm_vision_select_feature
+        self.mm_vision_select_layer = mm_vision_select_layer
+        self.video_token_id = video_token_id
+
+
+##### END COPY configuration.py #####
+
+##### BEGIN COPY modeling_vila.py #####
+
+
+class DownSample3x3BlockFix(nn.Module):
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x: The input tensor of shape (batch_size, sequence_length, mm_hidden_size).
+
+        Returns:
+            The output tensor of shape (batch_size, image_pad_len, mm_hidden_size * 9).
+        """
+
+        batch_size, sequence_length, hidden_size = x.shape
+
+        feat_size = int(sequence_length**0.5)
+        if feat_size**2 != sequence_length:
+            raise ValueError(
+                f"Cannot take square root: sequence_length {sequence_length} is not a perfect square"
+            )
+
+        features = x.reshape(batch_size, feat_size, feat_size, hidden_size)
+
+        pad_after = (3 - feat_size % 3) % 3
+        if pad_after > 0:
+            features = F.pad(features, (0, 0, 0, pad_after, 0, pad_after))
+            feat_size = feat_size + pad_after
+
+        features = features.reshape(
+            batch_size, feat_size // 3, 3, feat_size // 3, 3, hidden_size
+        )
+        features = features.permute(0, 1, 3, 2, 4, 5).contiguous()
+        features = features.reshape(batch_size, -1, 9 * hidden_size)
+
+        return features
+
+
+class MultimodalProjector(nn.Module):
+    layers: nn.Sequential
+
+    def __init__(
+        self,
+        config: VILAConfig,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+
+        if config.mm_projector_type == "mlp_downsample_3x3_fix":
+            self.layers = nn.Sequential(
+                DownSample3x3BlockFix(),
+                nn.LayerNorm(config.mm_hidden_size * 9),
+                nn.Linear(
+                    config.mm_hidden_size * 9,
+                    config.mm_hidden_size * 3,
+                ),
+                nn.GELU(),
+                nn.LayerNorm(config.vision_config.hidden_size * 3),
+                nn.Linear(config.vision_config.hidden_size * 3, config.hidden_size),
+                nn.GELU(),
+                nn.Linear(config.hidden_size, config.hidden_size),
+            )
+        else:
+            raise NotImplementedError(
+                f"Unsupported mm_projector_type: {config.mm_projector_type}"
+            )
+
+        self.layers.type(config.torch_dtype)
+
+    @property
+    def device(self) -> torch.device:
+        return next(self.parameters()).device
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x: The input tensor of shape (batch_size, sequence_length, mm_hidden_size).
+
+        Returns:
+            The output tensor of shape (batch_size, image_pad_len, hidden_size).
+        """
+
+        return self.layers(x.to(device=self.device, dtype=self.dtype))
+
+
+##### END COPY modeling_vila.py #####
+
+
+class VILAForConditionalGeneration(nn.Module):
+    config: VILAConfig
+    quant_config: Optional[QuantizationConfig]
+
+    logits_processor: LogitsProcessor
+    pooler: Pooler
+
+    llm: Qwen2ForCausalLM
+    mm_projector: MultimodalProjector
+    vision_tower: SiglipVisionModel
+
+    def __init__(
+        self,
+        config: VILAConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.quant_config = quant_config
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+        self.llm = Qwen2ForCausalLM(
+            config=config.text_config,
+            quant_config=quant_config,
+            prefix=utils.add_prefix("llm", prefix),
+        )
+        self.mm_projector = MultimodalProjector(config)
+        self.vision_tower = SiglipVisionModel(config.vision_config)
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.config.torch_dtype
+
+    def forward(
+        self,
+        input_ids: Tensor,
+        positions: Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        output = mm_utils.general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.llm,
+            image_data_embedding_func=self.get_image_feature,
+            get_embedding=get_embedding,
+            positions=positions,
+        )
+
+        return cast(LogitsProcessorOutput, output)
+
+    def get_image_feature(self, mm_input: List[MultimodalDataItem]) -> Tensor:
+        pixel_values = cast(Tensor, mm_input[0].pixel_values)
+
+        ##### BEGIN COPY modeling_vila.py #####
+
+        vision_tower_output: BaseModelOutputWithPooling = self.vision_tower.__call__(
+            pixel_values.to(
+                device=self.vision_tower.device, dtype=self.vision_tower.dtype
+            ),
+            output_hidden_states=True,
+        )
+
+        mm_projector_input = self._vision_tower_output_to_mm_projector_input(
+            vision_tower_output
+        )
+
+        image_embedding: Tensor = self.mm_projector.__call__(
+            mm_projector_input.to(
+                device=self.mm_projector.device, dtype=self.mm_projector.dtype
+            )
+        )
+
+        ##### END COPY modeling_vila.py #####
+
+        return image_embedding
+
+    def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> None:
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if name.startswith("llm."):
+                self.llm.load_weights([(name[len("llm.") :], loaded_weight)])
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(
+                    param, "weight_loader", weight_utils.default_weight_loader
+                )
+                weight_loader(param, loaded_weight)
+
+    def pad_input_ids(
+        self,
+        input_ids: List[int],
+        image_inputs: MultimodalInputs,
+    ) -> List[int]:
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens(
+            token_ids=[self.config.image_token_id],
+        )
+
+        return pattern.pad_input_tokens(input_ids, image_inputs)
+
+    ##### BEGIN COPY modeling_vila.py #####
+
+    def _vision_tower_output_to_mm_projector_input(
+        self,
+        vision_tower_output: BaseModelOutputWithPooling,
+    ) -> Tensor:
+        assert vision_tower_output.hidden_states is not None
+
+        selected_layer_hidden_states = vision_tower_output.hidden_states[
+            self.config.mm_vision_select_layer
+        ]
+
+        if self.config.mm_vision_select_feature == "cls_patch":
+            return selected_layer_hidden_states
+        else:
+            raise NotImplementedError(
+                f"Unsupported mm_vision_select_feature: {self.config.mm_vision_select_feature}"
+            )
+
+    ##### END COPY modeling_vila.py #####
+
+
+EntryClass = [VILAForConditionalGeneration]
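
Note (not part of the diff): the "mlp_downsample_3x3_fix" projector above folds each 3x3 block of SigLIP patch tokens into one token of width 9 * mm_hidden_size before the MLP maps it to hidden_size. A minimal standalone sketch of that shape math, assuming only torch is installed and using the config defaults above (mm_hidden_size=1152, hidden_size=1536) with a hypothetical 14x14 patch grid:

import torch
import torch.nn.functional as F

# Replays the reshaping done by DownSample3x3BlockFix for one example input.
x = torch.randn(1, 196, 1152)            # (batch, 14*14 patches, mm_hidden_size)
b, s, h = x.shape
feat = int(s**0.5)                       # 14
grid = x.reshape(b, feat, feat, h)

pad = (3 - feat % 3) % 3                 # pad 14 -> 15 so it divides by 3
if pad > 0:
    grid = F.pad(grid, (0, 0, 0, pad, 0, pad))
    feat += pad

grid = grid.reshape(b, feat // 3, 3, feat // 3, 3, h)
grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
out = grid.reshape(b, -1, 9 * h)
print(out.shape)                         # torch.Size([1, 25, 10368]); the MLP then maps 9*1152 -> 1536

The projected features are what get_image_feature returns, and general_mm_embed_routine splices them in at the image token positions.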
sglang/srt/openai_api/adapter.py
@@ -41,7 +41,11 @@ from sglang.srt.conversation import (
     register_conv_template,
 )
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
-from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
+from sglang.srt.managers.io_struct import (
+    EmbeddingReqInput,
+    GenerateReqInput,
+    V1RerankReqInput,
+)
 from sglang.srt.openai_api.protocol import (
     BatchRequest,
     BatchResponse,
@@ -69,6 +73,7 @@ from sglang.srt.openai_api.protocol import (
     FunctionResponse,
     LogProbs,
     MultimodalEmbeddingInput,
+    RerankResponse,
     ScoringRequest,
     ScoringResponse,
     ToolCall,
@@ -542,6 +547,7 @@ def v1_generate_request(
     logprob_start_lens = []
     top_logprobs_nums = []
     lora_paths = []
+    return_hidden_states = []

     for request in all_requests:
         # NOTE: with openai API, the prompt's logprobs are always not computed
@@ -581,6 +587,7 @@
                 "no_stop_trim": request.no_stop_trim,
                 "ignore_eos": request.ignore_eos,
                 "skip_special_tokens": request.skip_special_tokens,
+                "logit_bias": request.logit_bias,
             }
         )
         return_logprobs.append(request.logprobs is not None)
@@ -588,6 +595,7 @@
         top_logprobs_nums.append(
             request.logprobs if request.logprobs is not None else 0
         )
+        return_hidden_states.append(request.return_hidden_states)

     if len(all_requests) == 1:
         if isinstance(prompts[0], str) or isinstance(prompts[0][0], str):
@@ -599,6 +607,7 @@
         logprob_start_lens = logprob_start_lens[0]
         top_logprobs_nums = top_logprobs_nums[0]
         lora_paths = lora_paths[0]
+        return_hidden_states = return_hidden_states[0]
     else:
         if isinstance(prompts[0], str) or isinstance(prompts[0][0], str):
             prompt_kwargs = {"text": prompts}
@@ -615,6 +624,7 @@
         stream=all_requests[0].stream,
         rid=request_ids,
         lora_path=lora_paths,
+        return_hidden_states=return_hidden_states,
         bootstrap_host=all_requests[0].bootstrap_host,
         bootstrap_port=all_requests[0].bootstrap_port,
         bootstrap_room=all_requests[0].bootstrap_room,
@@ -683,6 +693,16 @@ def v1_generate_response(
         else:
             logprobs = None

+        hidden_states = None
+        if isinstance(request, list) and request[idx].return_hidden_states:
+            hidden_states = ret_item["meta_info"].get("hidden_states", None)
+        elif (not isinstance(request, list)) and request.return_hidden_states:
+            hidden_states = ret_item["meta_info"].get("hidden_states", None)
+        if hidden_states is not None:
+            hidden_states = (
+                hidden_states[-1] if hidden_states and len(hidden_states) > 1 else []
+            )
+
         finish_reason = ret_item["meta_info"]["finish_reason"]

         if to_file:
@@ -698,6 +718,8 @@
                     else None
                 ),
             }
+            if hidden_states is not None:
+                choice_data["hidden_states"] = hidden_states
         else:
             choice_data = CompletionResponseChoice(
                 index=idx,
@@ -709,6 +731,7 @@
                     if finish_reason and "matched" in finish_reason
                     else None
                 ),
+                hidden_states=hidden_states,
             )

         choices.append(choice_data)
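
Note (not part of the diff): return_hidden_states and logit_bias are plain request fields on the OpenAI-compatible endpoints; only the field names and the "hidden_states" key on the choice come from the code above. A hedged client sketch against a locally running server (URL, port, and model name are placeholders):

import requests

resp = requests.post(
    "http://localhost:30000/v1/completions",   # assumed local sglang server
    json={
        "model": "my-model",                    # placeholder served model name
        "prompt": "The capital of France is",
        "max_tokens": 8,
        "logit_bias": {"1234": -100},           # token id -> bias, forwarded into sampling_params
        "return_hidden_states": True,
    },
)
choice = resp.json()["choices"][0]
# Per v1_generate_response above, the choice then carries the last decoded
# token's hidden state under "hidden_states".
print(choice.get("hidden_states"))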
@@ -777,6 +800,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
             prompt_tokens = {}
             completion_tokens = {}
             cached_tokens = {}
+            hidden_states = {}

             try:
                 async for content in tokenizer_manager.generate_request(
@@ -791,6 +815,9 @@
                     prompt_tokens[index] = content["meta_info"]["prompt_tokens"]
                     completion_tokens[index] = content["meta_info"]["completion_tokens"]
                     cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
+                    hidden_states[index] = content["meta_info"].get(
+                        "hidden_states", None
+                    ) or hidden_states.get(index)

                     if not stream_buffer:  # The first chunk
                         if request.echo:
@@ -873,6 +900,27 @@
                     n_prev_tokens[index] = n_prev_token

                     yield f"data: {chunk.model_dump_json()}\n\n"
+                if request.return_hidden_states and hidden_states:
+                    for index, choice_hidden_states in hidden_states.items():
+                        last_token_hidden_states = (
+                            choice_hidden_states[-1]
+                            if choice_hidden_states and len(choice_hidden_states) > 1
+                            else []
+                        )
+                        hidden_states_chunk = CompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=created,
+                            choices=[
+                                CompletionResponseStreamChoice(
+                                    text="",
+                                    index=index,
+                                    hidden_states=last_token_hidden_states,
+                                    finish_reason=None,
+                                )
+                            ],
+                            model=request.model,
+                        )
+                        yield f"data: {hidden_states_chunk.model_dump_json()}\n\n"
                 if request.stream_options and request.stream_options.include_usage:
                     total_prompt_tokens = sum(
                         tokens
@@ -973,6 +1021,7 @@ def v1_chat_generate_request(
     top_logprobs_nums = []
     modalities_list = []
     lora_paths = []
+    return_hidden_states = []

     # NOTE: with openai API, the prompt's logprobs are always not computed

@@ -1176,6 +1225,7 @@
             "no_stop_trim": request.no_stop_trim,
             "ignore_eos": request.ignore_eos,
             "skip_special_tokens": request.skip_special_tokens,
+            "logit_bias": request.logit_bias,
         }

         if request.response_format and request.response_format.type == "json_schema":
@@ -1215,6 +1265,7 @@
         image_data_list.append(image_data)
         audio_data_list.append(audio_data)
         modalities_list.append(modalities)
+        return_hidden_states.append(request.return_hidden_states)
     if len(all_requests) == 1:
         if is_multimodal:
             # processor will need text input
@@ -1233,6 +1284,7 @@
         modalities_list = modalities_list[0]
         lora_paths = lora_paths[0]
         request_ids = request_ids[0]
+        return_hidden_states = return_hidden_states[0]
     else:
         if tokenizer_manager.model_config.is_multimodal:
             # processor will need text input
@@ -1259,6 +1311,7 @@
         bootstrap_host=all_requests[0].bootstrap_host,
         bootstrap_port=all_requests[0].bootstrap_port,
         bootstrap_room=all_requests[0].bootstrap_room,
+        return_hidden_states=return_hidden_states,
     )

     return adapted_request, all_requests if len(all_requests) > 1 else all_requests[0]
@@ -1319,6 +1372,20 @@
         else:
             choice_logprobs = None

+        if isinstance(request, list) and request[idx].return_hidden_states:
+            include_hidden_states = True
+        elif not isinstance(request, list) and request.return_hidden_states:
+            include_hidden_states = True
+        else:
+            include_hidden_states = False
+        if include_hidden_states and ret_item["meta_info"].get("hidden_states", None):
+            hidden_states = ret_item["meta_info"]["hidden_states"]
+            hidden_states = (
+                hidden_states[-1] if hidden_states and len(hidden_states) > 1 else []
+            )
+        else:
+            hidden_states = None
+
         finish_reason = ret_item["meta_info"]["finish_reason"]

         tool_calls = None
@@ -1391,6 +1458,8 @@
                     else None
                 ),
             }
+            if hidden_states is not None:
+                choice_data["hidden_states"] = hidden_states
         else:
             choice_data = ChatCompletionResponseChoice(
                 index=idx,
@@ -1407,6 +1476,7 @@
                     if finish_reason and "matched" in finish_reason
                     else None
                 ),
+                hidden_states=hidden_states,
             )

         choices.append(choice_data)
@@ -1479,19 +1549,23 @@ async def v1_chat_completions(
         reasoning_parser_dict = {}

         async def generate_stream_resp():
-            tool_call_first = True
+            tool_index_previous = -1
             is_firsts = {}
             stream_buffers = {}
             n_prev_tokens = {}
             prompt_tokens = {}
             completion_tokens = {}
             cached_tokens = {}
+            hidden_states = {}
             try:
                 async for content in tokenizer_manager.generate_request(
                     adapted_request, raw_request
                 ):
                     index = content.get("index", 0)
                     text = content["text"]
+                    hidden_states[index] = content["meta_info"].get(
+                        "hidden_states", None
+                    ) or hidden_states.get(index)

                     is_first = is_firsts.get(index, True)
                     stream_buffer = stream_buffers.get(index, "")
@@ -1613,6 +1687,7 @@ async def v1_chat_completions(
                     if (delta and len(delta) == 0) or not delta:
                         stream_buffers[index] = new_stream_buffer
                         is_firsts[index] = is_first
+                        n_prev_tokens[index] = n_prev_token
                         continue

                     if request.tool_choice != "none" and request.tools:
@@ -1645,6 +1720,7 @@

                         # 2) if we found calls, we output them as separate chunk(s)
                         for call_item in calls:
+                            tool_index_current = call_item.tool_index
                             # transform call_item -> FunctionResponse + ToolCall
                             if finish_reason_type == "stop":
                                 latest_delta_len = 0
@@ -1671,7 +1747,7 @@
                             tool_call = ToolCall(
                                 id=(
                                     f"call_{base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode()}"
-                                    if tool_call_first
+                                    if tool_index_previous != tool_index_current
                                     else None
                                 ),
                                 index=call_item.tool_index,
@@ -1680,7 +1756,7 @@
                                     arguments=call_item.parameters,
                                 ),
                             )
-                            tool_call_first = False
+                            tool_index_previous = tool_index_current
                             choice_data = ChatCompletionResponseStreamChoice(
                                 index=index,
                                 delta=DeltaMessage(tool_calls=[tool_call]),
@@ -1701,6 +1777,7 @@

                         stream_buffers[index] = new_stream_buffer
                         is_firsts[index] = is_first
+                        n_prev_tokens[index] = n_prev_token

                     else:
                         # No tool calls => just treat this as normal text
@@ -1733,6 +1810,7 @@
                         yield f"data: {chunk.model_dump_json()}\n\n"
                         stream_buffers[index] = new_stream_buffer
                         is_firsts[index] = is_first
+                        n_prev_tokens[index] = n_prev_token
                 if finish_reason_type == "stop" and request.tool_choice != "none":
                     parser = FunctionCallParser(
                         tools=request.tools,
@@ -1768,6 +1846,28 @@

                 else:
                     usage = None
+                if request.return_hidden_states and hidden_states:
+                    for index, choice_hidden_states in hidden_states.items():
+                        last_token_hidden_states = (
+                            choice_hidden_states[-1]
+                            if choice_hidden_states and len(choice_hidden_states) > 1
+                            else []
+                        )
+                        hidden_states_chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=created,
+                            choices=[
+                                ChatCompletionResponseStreamChoice(
+                                    index=index,
+                                    delta=DeltaMessage(
+                                        hidden_states=last_token_hidden_states
+                                    ),
+                                    finish_reason=finish_reason_type,
+                                )
+                            ],
+                            model=request.model,
+                        )
+                        yield f"data: {hidden_states_chunk.model_dump_json()}\n\n"
                 final_usage_chunk = ChatCompletionStreamResponse(
                     id=content["meta_info"]["id"],
                     created=created,
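
Note (not part of the diff): in streaming mode the hidden states arrive as one extra SSE chunk per choice after the text chunks, with the vector under delta.hidden_states, as emitted by the hunk above. A hedged sketch of consuming that stream (URL, port, and model name are placeholders):

import json
import requests

with requests.post(
    "http://localhost:30000/v1/chat/completions",   # assumed local sglang server
    json={
        "model": "my-model",                         # placeholder served model name
        "messages": [{"role": "user", "content": "Say hi"}],
        "stream": True,
        "return_hidden_states": True,
    },
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        payload = line[len(b"data: "):]
        if payload == b"[DONE]":
            break
        chunk = json.loads(payload)
        if not chunk.get("choices"):
            continue  # e.g. a usage-only chunk
        delta = chunk["choices"][0].get("delta") or {}
        if delta.get("hidden_states") is not None:
            print("last-token hidden state length:", len(delta["hidden_states"]))
        elif delta.get("content"):
            print(delta["content"], end="")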
@@ -1925,6 +2025,64 @@ async def v1_embeddings(tokenizer_manager, raw_request: Request):
     return response


+def v1_rerank_request(obj: V1RerankReqInput):
+    if obj.query is None:
+        raise ValueError("query is required")
+    if obj.documents is None or len(obj.documents) == 0:
+        raise ValueError("documents is required")
+
+    pairs = []
+    for doc in obj.documents:
+        pairs.append([obj.query, doc])
+
+    adapted_request = EmbeddingReqInput(
+        text=pairs,
+        is_cross_encoder_request=True,
+    )
+
+    return adapted_request
+
+
+def v1_rerank_response(ret, obj: V1RerankReqInput):
+
+    response = []
+    for idx, ret_item in enumerate(ret):
+        response.append(
+            RerankResponse(
+                score=ret[idx]["embedding"],
+                document=obj.documents[idx],
+                index=idx,
+                meta_info=ret[idx]["meta_info"],
+            )
+        )
+
+    response.sort(key=lambda x: x.score, reverse=True)
+
+    return response
+
+
+async def v1_rerank(tokenizer_manager, obj: V1RerankReqInput, raw_request: Request):
+    adapted_request = v1_rerank_request(obj)
+
+    try:
+        ret = await tokenizer_manager.generate_request(
+            adapted_request, raw_request
+        ).__anext__()
+
+    except ValueError as e:
+        return create_error_response(str(e))
+
+    if not isinstance(ret, list):
+        ret = [ret]
+
+    response = v1_rerank_response(
+        ret,
+        obj,
+    )
+
+    return response
+
+
 def to_openai_style_logprobs(
     input_token_logprobs=None,
     output_token_logprobs=None,
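
Note (not part of the diff): v1_rerank above scores each (query, document) pair through the embedding path with is_cross_encoder_request=True and returns a list sorted by score, highest first. The HTTP route itself is registered in sglang/srt/entrypoints/http_server.py (changed in this release but not shown in this section); /v1/rerank below is an assumption, as are host, port, and the example documents:

import requests

resp = requests.post(
    "http://localhost:30000/v1/rerank",      # assumed route for the v1_rerank handler
    json={
        "query": "What is the capital of France?",
        "documents": [
            "Paris is the capital of France.",
            "Berlin is the capital of Germany.",
        ],
    },
)
# v1_rerank_response returns one entry per document with score, document,
# index, and meta_info, sorted by score in descending order.
for item in resp.json():
    print(item["index"], item["score"], item["document"])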