sglang 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_one_batch.py +8 -6
- sglang/bench_serving.py +1 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/srt/_custom_ops.py +2 -2
- sglang/srt/code_completion_parser.py +2 -44
- sglang/srt/configs/model_config.py +6 -0
- sglang/srt/constants.py +3 -0
- sglang/srt/conversation.py +19 -3
- sglang/srt/custom_op.py +5 -1
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -1
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +211 -72
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +15 -9
- sglang/srt/disaggregation/mini_lb.py +34 -4
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +30 -29
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +17 -12
- sglang/srt/disaggregation/prefill.py +144 -55
- sglang/srt/disaggregation/utils.py +155 -123
- sglang/srt/distributed/parallel_state.py +12 -4
- sglang/srt/entrypoints/engine.py +37 -29
- sglang/srt/entrypoints/http_server.py +153 -72
- sglang/srt/entrypoints/http_server_engine.py +0 -3
- sglang/srt/entrypoints/openai/__init__.py +0 -0
- sglang/srt/{openai_api → entrypoints/openai}/protocol.py +84 -10
- sglang/srt/entrypoints/openai/serving_base.py +149 -0
- sglang/srt/entrypoints/openai/serving_chat.py +921 -0
- sglang/srt/entrypoints/openai/serving_completions.py +424 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
- sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
- sglang/srt/entrypoints/openai/serving_score.py +61 -0
- sglang/srt/entrypoints/openai/usage_processor.py +81 -0
- sglang/srt/entrypoints/openai/utils.py +72 -0
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +7 -4
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +64 -10
- sglang/srt/function_call/function_call_parser.py +6 -6
- sglang/srt/function_call/llama32_detector.py +1 -1
- sglang/srt/function_call/mistral_detector.py +1 -1
- sglang/srt/function_call/pythonic_detector.py +1 -1
- sglang/srt/function_call/qwen25_detector.py +1 -1
- sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
- sglang/srt/layers/activation.py +40 -3
- sglang/srt/layers/attention/aiter_backend.py +20 -4
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +39 -15
- sglang/srt/layers/attention/flashattention_backend.py +71 -72
- sglang/srt/layers/attention/flashinfer_backend.py +10 -8
- sglang/srt/layers/attention/flashinfer_mla_backend.py +29 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -12
- sglang/srt/layers/attention/tbo_backend.py +3 -3
- sglang/srt/layers/attention/triton_backend.py +138 -130
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +28 -10
- sglang/srt/layers/dp_attention.py +11 -2
- sglang/srt/layers/layernorm.py +29 -2
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +2 -14
- sglang/srt/layers/moe/ep_moe/kernels.py +165 -7
- sglang/srt/layers/moe/ep_moe/layer.py +249 -33
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +7 -4
- sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
- sglang/srt/layers/moe/topk.py +107 -12
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +25 -17
- sglang/srt/layers/quantization/fp8_kernel.py +44 -15
- sglang/srt/layers/quantization/fp8_utils.py +87 -22
- sglang/srt/layers/quantization/modelopt_quant.py +62 -8
- sglang/srt/layers/quantization/utils.py +5 -2
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +42 -2
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/lora_manager.py +249 -105
- sglang/srt/lora/mem_pool.py +53 -50
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +33 -14
- sglang/srt/managers/io_struct.py +31 -10
- sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +79 -37
- sglang/srt/managers/schedule_policy.py +70 -56
- sglang/srt/managers/scheduler.py +220 -79
- sglang/srt/managers/template_manager.py +226 -0
- sglang/srt/managers/tokenizer_manager.py +40 -10
- sglang/srt/managers/tp_worker.py +12 -2
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
- sglang/srt/mem_cache/base_prefix_cache.py +52 -8
- sglang/srt/mem_cache/chunk_cache.py +11 -15
- sglang/srt/mem_cache/hiradix_cache.py +38 -25
- sglang/srt/mem_cache/memory_pool.py +213 -505
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +56 -28
- sglang/srt/model_executor/cuda_graph_runner.py +198 -100
- sglang/srt/model_executor/forward_batch_info.py +32 -10
- sglang/srt/model_executor/model_runner.py +28 -12
- sglang/srt/model_loader/loader.py +16 -2
- sglang/srt/model_loader/weight_utils.py +11 -2
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +29 -27
- sglang/srt/models/deepseek_v2.py +213 -173
- sglang/srt/models/glm4.py +312 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/mimo_mtp.py +2 -18
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/vila.py +305 -0
- sglang/srt/reasoning_parser.py +21 -11
- sglang/srt/sampling/sampling_batch_info.py +24 -0
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +351 -238
- sglang/srt/speculative/build_eagle_tree.py +1 -1
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -9
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +130 -14
- sglang/srt/speculative/eagle_utils.py +468 -116
- sglang/srt/speculative/eagle_worker.py +258 -84
- sglang/srt/torch_memory_saver_adapter.py +19 -15
- sglang/srt/two_batch_overlap.py +4 -2
- sglang/srt/utils.py +235 -11
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +38 -3
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +2 -0
- sglang/test/test_utils.py +4 -1
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/METADATA +8 -14
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/RECORD +150 -128
- sglang/srt/entrypoints/verl_engine.py +0 -179
- sglang/srt/openai_api/adapter.py +0 -1990
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
sglang/srt/lora/mem_pool.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Dict, List, Optional, Set, Tuple
+from typing import Callable, Dict, List, Optional, Set, Tuple
 
 import torch
 
@@ -22,21 +22,16 @@ class LoRAMemoryPool:
         self,
         base_hf_config: AutoConfig,
         max_loras_per_batch: int,
-        max_lora_dim: int,
         dtype: torch.dtype,
         tp_size: int,
         tp_rank: int,
-        lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]],
     ):
-
         self.base_hf_config: AutoConfig = base_hf_config
         self.num_layer: int = base_hf_config.num_hidden_layers
         self.max_loras_per_batch: int = max_loras_per_batch
-        self.max_lora_dim: int = max_lora_dim
         self.dtype: torch.dtype = dtype
         self.tp_size: int = tp_size
         self.tp_rank: int = tp_rank
-        self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = lora_modules
 
         # Both A_buffer and B_buffer maps lora weight names to its buffer space.
         # A_buffer contains num_layer number of row-major tensors with shape
@@ -55,89 +50,95 @@ class LoRAMemoryPool:
         self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch
 
     def get_lora_A_shape(
-        self, module_name: str, base_model: torch.nn.Module
+        self, module_name: str, base_model: torch.nn.Module, max_lora_dim: int
     ) -> Tuple[int]:
         """
         Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
         """
        input_dim, _ = get_hidden_dim(module_name, self.base_hf_config, base_model)
         c = get_stacked_multiply(module_name)
-        if self.tp_size > 1:
-            if module_name in ROW_PARALLELISM_LINEAR_LORA_NAMES:
-                input_dim = divide(input_dim, self.tp_size)
+        if self.tp_size > 1 and module_name in ROW_PARALLELISM_LINEAR_LORA_NAMES:
+            input_dim = divide(input_dim, self.tp_size)
         return (
             self.max_loras_per_batch,
-            self.max_lora_dim * c,
+            max_lora_dim * c,
             input_dim,
         )
 
     def get_lora_B_shape(
-        self, module_name: str, base_model: torch.nn.Module
+        self, module_name: str, base_model: torch.nn.Module, max_lora_dim: int
     ) -> Tuple[int]:
         """
         Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
         """
         _, output_dim = get_hidden_dim(module_name, self.base_hf_config, base_model)
         c = get_stacked_multiply(module_name)
-        if self.tp_size > 1:
-            if module_name not in ROW_PARALLELISM_LINEAR_LORA_NAMES:
-                output_dim = divide(output_dim, self.tp_size)
+        if self.tp_size > 1 and module_name not in ROW_PARALLELISM_LINEAR_LORA_NAMES:
+            output_dim = divide(output_dim, self.tp_size)
         return (
             c,
             self.max_loras_per_batch,
             output_dim,
-            self.max_lora_dim,
+            max_lora_dim,
         )
 
     def init_buffers(
         self,
         lora_weight_names: Tuple[Set[str]],
         base_model: torch.nn.Module,
+        max_lora_dim: int,
     ):
-
         # lora_weight_names is a set of name pairs indicating each pair of lora modules to load
         # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj"), ("o_proj", "o_proj")}
         self.lora_weight_names: Tuple[Set[str]] = lora_weight_names
         device = next(base_model.parameters()).device
-        # [22 removed lines: the old one-shot A_buffer/B_buffer allocation; content lost in this extract]
+
+        def update_buffer(
+            buffer: Dict[str, List[torch.Tensor]],
+            lora_weight_names: Set[str],
+            get_lora_shape_fn: Callable[[str, torch.nn.Module, int], Tuple[int]],
+        ):
+            new_weight_names = lora_weight_names - buffer.keys()
+            for module_name in new_weight_names:
+                lora_shape = get_lora_shape_fn(module_name, base_model, max_lora_dim)
+                buffer[module_name] = [
+                    torch.empty(
+                        lora_shape,
+                        dtype=self.dtype,
+                        device=device,
+                    )
+                    for _ in range(self.num_layer)
+                ]
+
+        update_buffer(
+            self.A_buffer,
+            lora_weight_names[0],
+            self.get_lora_A_shape,
+        )
+
+        update_buffer(
+            self.B_buffer,
+            lora_weight_names[1],
+            self.get_lora_B_shape,
+        )
 
     def prepare_lora_batch(
         self,
         cur_uids: Set[Optional[str]],
         lora_adapters: Dict[str, LoRAAdapter],
+        lora_modules: Dict[int, Dict[str, BaseLayerWithLoRA]],
     ):
-
         def get_available_buffer_slot():
             for buffer_id in range(self.max_loras_per_batch):
                 # Prioritize empty slots
                 if self.buffer_id_to_uid[buffer_id] == "":
-                    return buffer_id, ""
+                    return buffer_id
 
             for buffer_id in range(self.max_loras_per_batch):
                 # Evict unneeded lora
                 if self.buffer_id_to_uid[buffer_id] not in cur_uids:
-                    return buffer_id, self.buffer_id_to_uid[buffer_id]
+                    self.uid_to_buffer_id.pop(self.buffer_id_to_uid[buffer_id])
+                    return buffer_id
 
             raise ValueError(
                 "No available buffer slots found. Please ensure the number of active loras is less than max_loras_per_batch."
@@ -145,17 +146,20 @@ class LoRAMemoryPool:
 
         for uid in cur_uids:
             if uid not in self.uid_to_buffer_id:
-                buffer_id, evicted_lora_uid = get_available_buffer_slot()
-                if evicted_lora_uid != "":
-                    self.uid_to_buffer_id.pop(evicted_lora_uid)
+                buffer_id = get_available_buffer_slot()
+                lora_adapter = lora_adapters.get(uid, None)
                 self.load_lora_weight_to_buffer(
-                    uid, buffer_id, lora_adapters.get(uid, None)
+                    uid, buffer_id, lora_adapter, lora_modules
                 )
                 self.uid_to_buffer_id[uid] = buffer_id
                 self.buffer_id_to_uid[buffer_id] = uid
 
     def load_lora_weight_to_buffer(
-        self, uid: str, buffer_id: int, lora_adapter: LoRAAdapter = None
+        self,
+        uid: str,
+        buffer_id: int,
+        lora_adapter: LoRAAdapter,
+        lora_modules: Dict[int, Dict[str, BaseLayerWithLoRA]],
     ):
         def check_lora_weight_shape(buffer_view: torch.Tensor, weight: torch.Tensor):
             assert (
@@ -187,8 +191,8 @@ class LoRAMemoryPool:
                 temp_B_buffer[lora_weight_name] = weights
 
         if self.tp_size > 1:
-            cur_layer_modules = self.lora_modules[layer_id]
-            for module_name, module in cur_layer_modules:
+            cur_layer_modules = lora_modules[layer_id]
+            for module_name, module in cur_layer_modules.items():
                 if "qkv_proj" in module_name:
                     temp_A_buffer["qkv_proj"] = module.slice_lora_a_weights(
                         temp_A_buffer["qkv_proj"], self.tp_rank
@@ -237,7 +241,6 @@ class LoRAMemoryPool:
     def get_tensor(
         self, weight_name: str, layer_id: int, lora_type: LoRAType
     ) -> torch.Tensor:
-
         if lora_type == LoRAType.LORA_A:
             return self.A_buffer[weight_name][layer_id]
 
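The net effect of this change is that buffer allocation becomes incremental: `init_buffers` can be re-run when newly loaded adapters introduce weight names the pool has not seen, and only the missing buffers are created. Passing `max_lora_dim` into the shape helpers instead of fixing it at construction presumably supports re-initializing when a later adapter needs a different rank. A minimal sketch of the allocate-only-what's-new pattern (the shape function, dtype, and names below are illustrative, not the real pool's):

```python
import torch

NUM_LAYERS = 2

def update_buffer(buffer, weight_names, get_shape_fn):
    # Allocate per-layer tensors only for names not seen before,
    # mirroring `lora_weight_names - buffer.keys()` in the diff above.
    for name in weight_names - buffer.keys():
        buffer[name] = [
            torch.empty(get_shape_fn(name), dtype=torch.float16)
            for _ in range(NUM_LAYERS)
        ]

A_buffer = {}
update_buffer(A_buffer, {"qkv_proj", "o_proj"}, lambda name: (4, 16, 32))
# A later call with an overlapping set allocates only the new name.
update_buffer(A_buffer, {"qkv_proj", "gate_up_proj"}, lambda name: (4, 16, 32))
print(sorted(A_buffer))  # ['gate_up_proj', 'o_proj', 'qkv_proj']
```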
sglang/srt/lora/utils.py
CHANGED
@@ -108,7 +108,7 @@ def get_hidden_dim(
 
 
 def get_normalized_lora_weight_names(name: str) -> Tuple[List[str], List[str]]:
     """
-    Mapping a target module name to names of the
+    Mapping a target module name to names of the normalized LoRA weights.
     Returned tuple contains (name for Lora A, name for Lora B)
     """
     params_mapping = {
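For context, the pairs this function returns follow the stacked-module convention quoted in the `mem_pool.py` comment above: `("qkv_proj", "q_proj")`, `("qkv_proj", "kv_proj")`, `("o_proj", "o_proj")`. A hypothetical sketch of the lookup, with an illustrative subset standing in for the real `params_mapping` table:

```python
# Hypothetical subset of the normalization table; q/k/v projections share
# one stacked "qkv_proj" LoRA A buffer, per the example pairs above.
params_mapping = {
    "q_proj": (["qkv_proj"], ["q_proj"]),
    "k_proj": (["qkv_proj"], ["kv_proj"]),
    "v_proj": (["qkv_proj"], ["kv_proj"]),
}

def get_normalized_lora_weight_names(name):
    # Unstacked modules map to their own name for both LoRA A and B.
    return params_mapping.get(name, ([name], [name]))

print(get_normalized_lora_weight_names("q_proj"))  # (['qkv_proj'], ['q_proj'])
print(get_normalized_lora_weight_names("o_proj"))  # (['o_proj'], ['o_proj'])
```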
sglang/srt/managers/cache_controller.py
CHANGED
@@ -18,33 +18,50 @@ import logging
 import math
 import threading
 from queue import Empty, Full, PriorityQueue, Queue
-from typing import List, Optional
+from typing import TYPE_CHECKING, List, Optional
 
 import torch
 
-
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+    from sglang.srt.mem_cache.memory_pool_host import HostKVCache
 
 logger = logging.getLogger(__name__)
 
 
 class LayerDoneCounter:
     def __init__(self, num_layers):
-        self.counter = num_layers
-        self.condition = threading.Condition()
+        self.num_layers = num_layers
+        # extra producer and consumer counters for overlap mode
+        self.num_counters = 3
+        self.counters = [num_layers] * self.num_counters
+        self.conditions = [threading.Condition() for _ in range(self.num_counters)]
+        self.producer_index = 0
+        self.consumer_index = 0
+
+    def next_producer(self):
+        return (self.producer_index + 1) % self.num_counters
+
+    def update_producer(self):
+        self.producer_index = self.next_producer()
+        return self.producer_index
+
+    def set_consumer(self, index):
+        self.consumer_index = index
 
     def increment(self):
-        with self.condition:
-            self.counter += 1
-            self.condition.notify_all()
+        with self.conditions[self.producer_index]:
+            self.counters[self.producer_index] += 1
+            self.conditions[self.producer_index].notify_all()
 
     def wait_until(self, threshold):
-        with self.condition:
-            while self.counter <= threshold:
-                self.condition.wait()
+        with self.conditions[self.consumer_index]:
+            while self.counters[self.consumer_index] <= threshold:
+                self.conditions[self.consumer_index].wait()
 
     def reset(self):
-        with self.condition:
-            self.counter = 0
+        with self.conditions[self.producer_index]:
+            self.counters[self.producer_index] = 0
 
 
 class CacheOperation:
@@ -147,7 +164,7 @@ class HiCacheController:
 
     def __init__(
         self,
-        token_to_kv_pool_allocator: TokenToKVPoolAllocator,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
         mem_pool_host: HostKVCache,
         page_size: int,
         load_cache_event: threading.Event = None,
@@ -295,7 +312,6 @@ class HiCacheController:
         while not self.stop_event.is_set():
             try:
                 operation = self.load_queue.get(block=True, timeout=1)
-                # time.sleep(18e-6 * len(operation.host_indices))
                 operation.data = self.mem_pool_host.get_flat_data(
                     operation.host_indices
                 )
@@ -319,6 +335,7 @@ class HiCacheController:
             if not self.load_cache_event.is_set():
                 continue
             self.load_cache_event.clear()
+            self.layer_done_counter.update_producer()
 
             batch_operation = None
             while self.load_queue.qsize() > 0:
@@ -330,6 +347,7 @@ class HiCacheController:
             if batch_operation is None:
                 continue
 
+            # start layer-wise KV cache transfer from CPU to GPU
             self.layer_done_counter.reset()
             for i in range(self.mem_pool_host.layer_num):
                 if self.page_size == 1:
@@ -465,6 +483,7 @@ class HiCacheController:
         except Exception as e:
             logger.error(e)
 
+    # todo (zhiqiang): double buffering to be deprecated
    def write_thread_func_buffer(self):
        aux_thread = threading.Thread(target=self.write_aux_func, daemon=True)
        aux_thread.start()
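The `LayerDoneCounter` rework swaps one shared condition-variable counter for a small ring of producer/consumer counters so that, in overlap mode, a new transfer batch can publish progress on the next counter while the previous batch's consumers still wait on the old one. The underlying handshake is unchanged: a loader thread marks layers done one at a time, and attention layers block until their layer's KV cache has arrived. A single-counter sketch of that handshake (function names and the sleep are illustrative):

```python
import threading
import time

condition = threading.Condition()
layers_done = 0

def load_layers(num_layers):
    # Producer: finish one layer's host->device copy, then notify waiters.
    global layers_done
    for _ in range(num_layers):
        time.sleep(0.01)  # stand-in for copying one layer of KV cache
        with condition:
            layers_done += 1
            condition.notify_all()

def wait_until(layer_id):
    # Consumer: block until this layer's KV cache has been loaded.
    with condition:
        while layers_done <= layer_id:
            condition.wait()

t = threading.Thread(target=load_layers, args=(4,))
t.start()
for layer_id in range(4):
    wait_until(layer_id)
t.join()
print("all 4 layers loaded and consumed")
```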
sglang/srt/managers/io_struct.py
CHANGED
@@ -87,7 +87,7 @@ class GenerateReqInput:
 
     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
-    # LoRA
+    # The path to the LoRA
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
 
     # Session info for continual prompting
@@ -99,7 +99,7 @@ class GenerateReqInput:
     custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
 
     # Whether to return hidden states
-    return_hidden_states: bool = False
+    return_hidden_states: Union[List[bool], bool] = False
 
     # For disaggregated inference
     bootstrap_host: Optional[Union[List[str], str]] = None
@@ -226,11 +226,11 @@ class GenerateReqInput:
 
         # Expand input based on type
         self._expand_inputs(num)
+        self._normalize_rid(num)
         self._normalize_lora_paths(num)
         self._normalize_image_data(num)
         self._normalize_audio_data(num)
         self._normalize_sampling_params(num)
-        self._normalize_rid(num)
         self._normalize_logprob_params(num)
         self._normalize_custom_logit_processor(num)
 
@@ -409,7 +409,11 @@ class GenerateReqInput:
                 if self.custom_logit_processor is not None
                 else None
             ),
-            return_hidden_states=self.return_hidden_states,
+            return_hidden_states=(
+                self.return_hidden_states[i]
+                if isinstance(self.return_hidden_states, list)
+                else self.return_hidden_states
+            ),
             # if `__getitem__` is called, the bootstrap_host, bootstrap_port, bootstrap_room must be a list
             bootstrap_host=(
                 self.bootstrap_host[i] if self.bootstrap_host is not None else None
@@ -477,7 +481,7 @@ class TokenizedGenerateReqInput:
 @dataclass
 class EmbeddingReqInput:
     # The input prompt. It can be a single prompt or a batch of prompts.
-    text: Optional[Union[List[str], str]] = None
+    text: Optional[Union[List[List[str]], List[str], str]] = None
     # The image input. It can be an image instance, file name, URL, or base64 encoded string.
     # Can be formatted as:
     # - Single image for a single request
@@ -501,6 +505,8 @@ class EmbeddingReqInput:
     log_metrics: bool = True
     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
+    # For cross-encoder requests
+    is_cross_encoder_request: bool = False
 
     def contains_mm_input(self) -> bool:
         return has_valid_data(self.image_data) or has_valid_data(self.audio_data)
@@ -524,6 +530,7 @@ class EmbeddingReqInput:
         if self.text is not None:
             if isinstance(self.text, list):
                 self.batch_size += len(self.text)
+                self.is_single = False
             else:
                 self.batch_size += 1
 
@@ -531,12 +538,10 @@ class EmbeddingReqInput:
         if self.input_ids is not None:
             if isinstance(self.input_ids[0], list):
                 self.batch_size += len(self.input_ids)
+                self.is_single = False
             else:
                 self.batch_size += 1
 
-        if self.batch_size > 1:
-            self.is_single = False
-
         # Fill in default arguments
         if self.is_single:
             if self.rid is None:
@@ -560,6 +565,16 @@ class EmbeddingReqInput:
         return self.rid
 
     def __getitem__(self, i):
+        if self.is_cross_encoder_request:
+            return EmbeddingReqInput(
+                text=[self.text[i]] if self.text is not None else None,
+                input_ids=None,
+                image_data=None,
+                sampling_params=self.sampling_params[i],
+                rid=self.rid[i],
+                is_cross_encoder_request=True,
+            )
+
         return EmbeddingReqInput(
             text=self.text[i] if self.text is not None else None,
             input_ids=self.input_ids[i] if self.input_ids is not None else None,
@@ -579,6 +594,8 @@ class TokenizedEmbeddingReqInput:
     input_ids: List[int]
     # The image inputs
     image_inputs: dict
+    # The token type ids
+    token_type_ids: List[int]
     # Dummy sampling params for compatibility
     sampling_params: SamplingParams
 
@@ -794,7 +811,9 @@ class GetWeightsByNameReqOutput:
 
 @dataclass
 class ReleaseMemoryOccupationReqInput:
-    pass
+    # Optional tags to identify the memory region, which is primarily used for RL
+    # Currently we only support `weights` and `kv_cache`
+    tags: Optional[List[str]] = None
 
 
 @dataclass
@@ -804,7 +823,9 @@ class ReleaseMemoryOccupationReqOutput:
 
 @dataclass
 class ResumeMemoryOccupationReqInput:
-    pass
+    # Optional tags to identify the memory region, which is primarily used for RL
+    # Currently we only support `weights` and `kv_cache`
+    tags: Optional[List[str]] = None
 
 
 @dataclass
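Among these changes, `return_hidden_states` now accepts either a single flag for the whole batch or a per-request list, with `__getitem__` picking the right element when a batch is split into sub-requests. A toy version of that broadcast-or-index pattern (the dataclass below is a stand-in, not the real `GenerateReqInput`):

```python
from dataclasses import dataclass
from typing import List, Union

@dataclass
class ToyReq:
    return_hidden_states: Union[List[bool], bool] = False

    def __getitem__(self, i):
        # Scalar flags broadcast to every sub-request; lists are indexed.
        return ToyReq(
            return_hidden_states=(
                self.return_hidden_states[i]
                if isinstance(self.return_hidden_states, list)
                else self.return_hidden_states
            )
        )

batch = ToyReq(return_hidden_states=[True, False])
print(batch[0].return_hidden_states, batch[1].return_hidden_states)  # True False
print(ToyReq(return_hidden_states=True)[1].return_hidden_states)     # True
```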
sglang/srt/managers/multimodal_processors/base_processor.py
CHANGED
@@ -146,7 +146,7 @@ class BaseMultimodalProcessor(ABC):
         request_obj,
         max_req_input_len,
         **kwargs,
-    ):
+    ) -> Optional[Dict[str, Any]]:
         pass
 
     def get_estimated_frames_list(self, image_data):
@@ -261,7 +261,7 @@ class BaseMultimodalProcessor(ABC):
 
     def load_mm_data(
         self,
-        prompt: str,
+        prompt: str | List[int],
         multimodal_tokens: MultimodalSpecialTokens,
         max_req_input_len: int,
         image_data: Optional[list] = None,
sglang/srt/managers/multimodal_processors/vila.py
ADDED
@@ -0,0 +1,85 @@
+from typing import Any, Dict, List, Optional, Type, cast
+
+import torch.nn as nn
+from transformers.configuration_utils import PretrainedConfig
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
+from sglang.srt.managers.io_struct import (
+    EmbeddingReqInput,
+    GenerateReqInput,
+    ImageDataItem,
+)
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.vila import VILAForConditionalGeneration
+from sglang.srt.server_args import ServerArgs
+
+
+class VILAProcessor(ProcessorMixin):
+    """A stub class for the VILA processor."""
+
+    tokenizer: PreTrainedTokenizerBase
+
+
+class VILAMultimodalProcessor(BaseMultimodalProcessor):
+    models: List[Type[nn.Module]] = [VILAForConditionalGeneration]
+
+    _processor: VILAProcessor
+
+    def __init__(
+        self,
+        hf_config: PretrainedConfig,
+        server_args: ServerArgs,
+        _processor: VILAProcessor,
+    ) -> None:
+        super().__init__(hf_config, server_args, _processor)
+
+    async def process_mm_data_async(
+        self,
+        image_data: Optional[ImageDataItem | List[ImageDataItem]],
+        input_text: str | List[int],
+        request_obj: GenerateReqInput | EmbeddingReqInput,
+        max_req_input_len: int,
+        **kwargs,
+    ) -> Optional[Dict[str, Any]]:
+        if not image_data:
+            return None
+
+        if not isinstance(image_data, list):
+            image_data = [image_data]
+
+        mm_data = self.load_mm_data(
+            prompt=input_text,
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self._processor.tokenizer.image_token
+            ),
+            max_req_input_len=max_req_input_len,
+            image_data=image_data,
+        )
+
+        inputs = self.process_mm_data(
+            input_text=mm_data.input_text,
+            images=mm_data.images,
+        )
+
+        image_offsets = self.get_mm_items_offset(
+            input_ids=inputs.input_ids[0],
+            mm_token_id=cast(int, self._processor.tokenizer.image_token_id),
+        )
+
+        mm_items: List[MultimodalDataItem] = [
+            MultimodalDataItem(
+                modality=Modality.IMAGE,
+                image_offsets=image_offsets,
+                pixel_values=inputs.pixel_values,
+            )
+        ]
+
+        return dict(
+            input_ids=inputs.input_ids[0].tolist(),
+            mm_items=mm_items,
+        )
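The new VILA processor relies on `get_mm_items_offset` to locate image placeholder tokens in the tokenized prompt so vision features can be spliced in at the right positions. A toy illustration of that lookup (the placeholder token id is made up, and the real helper's return format may differ, e.g. grouping positions into per-item ranges):

```python
import torch

def image_token_positions(input_ids: torch.Tensor, image_token_id: int):
    # Indices where the image placeholder token occurs in the prompt.
    return (input_ids == image_token_id).nonzero(as_tuple=True)[0].tolist()

# 32000 is an illustrative placeholder id, not VILA's actual one.
ids = torch.tensor([1, 42, 32000, 32000, 32000, 7, 2])
print(image_token_positions(ids, image_token_id=32000))  # [2, 3, 4]
```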