sglang 0.4.6.post2__py3-none-any.whl → 0.4.6.post4__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_offline_throughput.py +4 -2
- sglang/bench_one_batch.py +3 -13
- sglang/bench_one_batch_server.py +143 -15
- sglang/bench_serving.py +158 -8
- sglang/compile_deep_gemm.py +1 -1
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +119 -75
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +5 -2
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/internvl.py +696 -0
- sglang/srt/configs/janus_pro.py +3 -0
- sglang/srt/configs/model_config.py +18 -0
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +71 -53
- sglang/srt/conversation.py +78 -46
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +11 -3
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +74 -23
- sglang/srt/disaggregation/mooncake/conn.py +236 -138
- sglang/srt/disaggregation/nixl/conn.py +242 -71
- sglang/srt/disaggregation/prefill.py +7 -4
- sglang/srt/disaggregation/utils.py +51 -2
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -8
- sglang/srt/distributed/device_communicators/npu_communicator.py +39 -0
- sglang/srt/distributed/device_communicators/pynccl.py +2 -1
- sglang/srt/distributed/device_communicators/shm_broadcast.py +2 -1
- sglang/srt/distributed/parallel_state.py +22 -1
- sglang/srt/entrypoints/engine.py +31 -4
- sglang/srt/entrypoints/http_server.py +45 -3
- sglang/srt/entrypoints/verl_engine.py +3 -2
- sglang/srt/function_call_parser.py +2 -2
- sglang/srt/hf_transformers_utils.py +20 -1
- sglang/srt/layers/attention/flashattention_backend.py +147 -51
- sglang/srt/layers/attention/flashinfer_backend.py +23 -13
- sglang/srt/layers/attention/flashinfer_mla_backend.py +62 -15
- sglang/srt/layers/attention/merge_state.py +46 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/triton_ops/merge_state.py +96 -0
- sglang/srt/layers/attention/utils.py +4 -2
- sglang/srt/layers/attention/vision.py +290 -163
- sglang/srt/layers/dp_attention.py +71 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/ep_moe/kernels.py +343 -8
- sglang/srt/layers/moe/ep_moe/layer.py +121 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +97 -54
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -4
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +2 -1
- sglang/srt/layers/quantization/deep_gemm.py +77 -71
- sglang/srt/layers/quantization/fp8.py +110 -97
- sglang/srt/layers/quantization/fp8_kernel.py +81 -62
- sglang/srt/layers/quantization/fp8_utils.py +71 -23
- sglang/srt/layers/quantization/int8_kernel.py +2 -2
- sglang/srt/layers/quantization/kv_cache.py +3 -10
- sglang/srt/layers/quantization/utils.py +0 -5
- sglang/srt/layers/quantization/w8a8_fp8.py +8 -10
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +11 -14
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +115 -119
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/io_struct.py +13 -1
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/multimodal_processors/base_processor.py +5 -0
- sglang/srt/managers/multimodal_processors/internvl.py +232 -0
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/schedule_batch.py +93 -23
- sglang/srt/managers/schedule_policy.py +11 -8
- sglang/srt/managers/scheduler.py +140 -100
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/tokenizer_manager.py +157 -47
- sglang/srt/managers/tp_worker.py +21 -21
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/chunk_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +4 -2
- sglang/srt/metrics/collector.py +312 -37
- sglang/srt/model_executor/cuda_graph_runner.py +10 -11
- sglang/srt/model_executor/forward_batch_info.py +1 -1
- sglang/srt/model_executor/model_runner.py +57 -41
- sglang/srt/model_loader/loader.py +18 -11
- sglang/srt/models/clip.py +4 -4
- sglang/srt/models/deepseek_janus_pro.py +3 -3
- sglang/srt/models/deepseek_nextn.py +1 -20
- sglang/srt/models/deepseek_v2.py +77 -39
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/internlm2.py +3 -0
- sglang/srt/models/internvl.py +670 -0
- sglang/srt/models/llama.py +3 -1
- sglang/srt/models/llama4.py +58 -13
- sglang/srt/models/llava.py +248 -5
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/phi3_small.py +16 -2
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2_5_vl.py +8 -4
- sglang/srt/models/qwen2_vl.py +4 -4
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/xiaomi_mimo.py +171 -0
- sglang/srt/openai_api/adapter.py +52 -42
- sglang/srt/openai_api/protocol.py +20 -16
- sglang/srt/reasoning_parser.py +1 -1
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +2 -2
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server_args.py +64 -10
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +7 -7
- sglang/srt/speculative/eagle_worker.py +22 -19
- sglang/srt/utils.py +41 -6
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deepep_utils.py +219 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +92 -15
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/METADATA +18 -9
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/RECORD +150 -137
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/WHEEL +1 -1
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post2.dist-info → sglang-0.4.6.post4.dist-info}/top_level.txt +0 -0
sglang/srt/managers/cache_controller.py
CHANGED
@@ -268,98 +268,97 @@ class HiCacheController:
         """
         Directly write through KV caches to host memory without buffering.
         """
-        … (removed implementation not shown in this view)
+        torch.cuda.set_stream(self.write_stream)
+        while not self.stop_event.is_set():
+            try:
+                operation = self.write_queue.get(block=True, timeout=1)
+                self.mem_pool_host.write_page_all_layers(
+                    operation.host_indices,
+                    operation.device_indices,
+                    self.mem_pool_device,
+                )
+                self.write_stream.synchronize()
+                self.mem_pool_host.complete_io(operation.host_indices)
+                for node_id in operation.node_ids:
+                    if node_id != 0:
+                        self.ack_write_queue.put(node_id)
+            except Empty:
+                continue
+            except Exception as e:
+                logger.error(e)
 
     def load_thread_func_direct(self):
         """
         Directly load KV caches from host memory to device memory without buffering.
         """
-        … (removed implementation not shown in this view)
-                except Exception as e:
-                    logger.error(e)
+        torch.cuda.set_stream(self.load_stream)
+        while not self.stop_event.is_set():
+            try:
+                operation = self.load_queue.get(block=True, timeout=1)
+                # time.sleep(18e-6 * len(operation.host_indices))
+                operation.data = self.mem_pool_host.get_flat_data(
+                    operation.host_indices
+                )
+                self.mem_pool_device.transfer(operation.device_indices, operation.data)
+                self.mem_pool_host.complete_io(operation.host_indices)
+                for node_id in operation.node_ids:
+                    if node_id != 0:
+                        self.ack_load_queue.put(node_id)
+            except Empty:
+                continue
+            except Exception as e:
+                logger.error(e)
 
     def load_thread_func_layer_by_layer(self):
         """
         Load KV caches from host memory to device memory layer by layer.
         """
-        … (removed implementation not shown in this view)
+        torch.cuda.set_stream(self.load_stream)
+        while not self.stop_event.is_set():
+            self.load_cache_event.wait(timeout=1)
+            if not self.load_cache_event.is_set():
+                continue
+            self.load_cache_event.clear()
 
-            … (removed lines not shown in this view)
-            if batch_operation is None:
-                batch_operation = op
-            else:
-                batch_operation.merge(op)
-            if batch_operation is None:
-                … (removed line not shown in this view)
+            batch_operation = None
+            while self.load_queue.qsize() > 0:
+                op = self.load_queue.get(block=True)
+                if batch_operation is None:
+                    batch_operation = op
+                else:
+                    batch_operation.merge(op)
+            if batch_operation is None:
+                continue
 
-            … (removed implementation not shown in this view)
+            self.layer_done_counter.reset()
+            for i in range(self.mem_pool_host.layer_num):
+                if self.page_size == 1:
+                    flat_data = self.mem_pool_host.get_flat_data_by_layer(
+                        batch_operation.host_indices, i
+                    )
+                    self.mem_pool_device.transfer_per_layer(
+                        batch_operation.device_indices, flat_data, i
+                    )
+                else:
+                    self.mem_pool_host.load_page_per_layer(
+                        batch_operation.host_indices,
+                        batch_operation.device_indices,
+                        self.mem_pool_device,
+                        i,
+                    )
+                self.load_stream.synchronize()
+                self.layer_done_counter.increment()
+
+            self.mem_pool_host.complete_io(batch_operation.host_indices)
+            for node_id in batch_operation.node_ids:
+                if node_id != 0:
+                    self.ack_load_queue.put(node_id)
 
     def write_aux_func(self, no_wait=False):
        """
         Auxiliary function to prepare the buffer for write operations.
         """
+        torch.cuda.set_stream(self.write_stream)
 
         def _to_op(op_):
             assert op_.device_indices.is_cuda, "Device indices should be on GPU"
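The rewritten thread bodies above share one pattern: each worker selects its own CUDA stream, then polls a queue with a timeout so `stop_event` can interrupt it, acking node ids only after the transfer completes. A minimal, CPU-only sketch of that pattern (the `StreamWorker` class, queue names, and ack protocol here are illustrative stand-ins, not the sglang API):

```python
import logging
import queue
import threading

logger = logging.getLogger(__name__)

class StreamWorker:
    """Toy version of the queue-polling worker loop used above."""

    def __init__(self):
        self.work_queue = queue.Queue()   # operations to run
        self.ack_queue = queue.Queue()    # completion notifications
        self.stop_event = threading.Event()

    def worker(self):
        # The real code first pins a dedicated CUDA stream via
        # torch.cuda.set_stream(...); omitted here to stay CPU-only.
        while not self.stop_event.is_set():
            try:
                # The 1s timeout keeps the loop responsive to stop_event
                # even when the queue stays empty.
                op = self.work_queue.get(block=True, timeout=1)
                op()                      # perform the transfer
                self.ack_queue.put(op)    # ack only after completion
            except queue.Empty:
                continue
            except Exception as e:
                # Log and keep the worker alive rather than crashing it.
                logger.error(e)

w = StreamWorker()
t = threading.Thread(target=w.worker, daemon=True)
t.start()
w.work_queue.put(lambda: None)   # enqueue a no-op operation
w.ack_queue.get()                # wait for its ack
w.stop_event.set()
t.join()
```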
@@ -370,44 +369,42 @@ class HiCacheController:
             return op_
 
         buffer = None
-        … (several removed lines not shown in this view)
-                    // self.write_buffer.max_buffer_size
-                )
-                _to_op(buffer)
-                buffer = None
-                if factor < 2:
-                    _to_op(operation)
-                else:
-                    split_ops = operation.split(factor)
-                    for op_ in split_ops:
-                        _to_op(op_)
-                continue
-            if buffer is None:
-                buffer = operation
-            else:
-                buffer.merge(operation)
-            if (
-                no_wait
-                or len(buffer.host_indices) >= self.write_buffer.max_buffer_size
-                or self.write_queue.empty()
-                or self.write_buffer.empty()
-            ):
-                _to_op(buffer)
-                buffer = None
+        while not self.stop_event.is_set():
+            try:
+                operation = self.write_queue.get(block=True, timeout=1)
+                factor = (
+                    len(operation.device_indices) // self.write_buffer.max_buffer_size
+                )
+
+                if factor >= 1:
+                    if buffer is not None:
+                        _to_op(buffer)
+                        buffer = None
+
+                    if factor < 2:
+                        _to_op(operation)
+                    else:
+                        split_ops = operation.split(factor)
+                        for op_ in split_ops:
+                            _to_op(op_)
+                    continue
+
+                if buffer is None:
+                    buffer = operation
+                else:
+                    buffer.merge(operation)
+                if (
+                    no_wait
+                    or len(buffer.host_indices) >= self.write_buffer.max_buffer_size
+                    or self.write_queue.empty()
+                    or self.write_buffer.empty()
+                ):
+                    _to_op(buffer)
+                    buffer = None
+            except Empty:
+                continue
+            except Exception as e:
+                logger.error(e)
 
     def load_aux_func(self):
         """
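The restructured `write_aux_func` is a simple batching policy: an operation spanning at least one full buffer (`factor >= 1`) first flushes anything pending, then is forwarded whole or split into buffer-sized chunks; smaller operations accumulate into `buffer` until a flush condition fires. A toy illustration of just that policy, with a hypothetical `Op` class standing in for the real operation objects:

```python
MAX_BUFFER_SIZE = 4

class Op:
    """Hypothetical operation carrying a list of indices."""
    def __init__(self, indices):
        self.indices = list(indices)

    def merge(self, other):
        self.indices += other.indices

    def split(self, factor):
        # even chunks; the last chunk takes any remainder
        n = -(-len(self.indices) // factor)  # ceil division
        return [Op(self.indices[i:i + n]) for i in range(0, len(self.indices), n)]

def batch(ops):
    """Yield flush-ready ops, mirroring the factor >= 1 / factor < 2 branches."""
    buffer = None
    for op in ops:
        factor = len(op.indices) // MAX_BUFFER_SIZE
        if factor >= 1:
            if buffer is not None:           # flush pending small ops first
                yield buffer
                buffer = None
            if factor < 2:
                yield op                     # fits one buffer; send whole
            else:
                yield from op.split(factor)  # split oversized op
            continue
        if buffer is None:                   # accumulate small ops
            buffer = Op(op.indices)
        else:
            buffer.merge(op)
        if len(buffer.indices) >= MAX_BUFFER_SIZE:
            yield buffer
            buffer = None
    if buffer is not None:
        yield buffer

print([o.indices for o in batch([Op([1]), Op([2]), Op(range(9))])])
# -> [[1, 2], [0, 1, 2, 3, 4], [5, 6, 7, 8]]
```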
@@ -484,19 +481,18 @@ class HiCacheController:
         aux_thread.join()
 
     def load_thread_func_buffer(self):
+        torch.cuda.set_stream(self.load_stream)
         aux_thread = threading.Thread(target=self.load_aux_func, daemon=True)
         aux_thread.start()
-        … (removed consumer loop not shown in this view)
-                if node_id != 0:
-                    self.ack_load_queue.put(node_id)
+        while not self.stop_event.is_set():
+            operation = self.load_buffer.get()
+            if operation is None:
+                continue
+            self.mem_pool_device.transfer(operation.device_indices, operation.data)
+            self.mem_pool_host.complete_io(operation.host_indices)
+            for node_id in operation.node_ids:
+                if node_id != 0:
+                    self.ack_load_queue.put(node_id)
        aux_thread.join()
 
     def evict_device(
sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -17,13 +17,13 @@ import logging
 import multiprocessing as mp
 import signal
 import threading
+import time
 from enum import Enum, auto
 
 import psutil
 import setproctitle
 import zmq
 
-from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.managers.io_struct import (
     TokenizedEmbeddingReqInput,
@@ -158,7 +158,7 @@ class DataParallelController:
         # This thread cannot be closed because otherwise the `kill_itself_when_parent_died`
         # function in scheduler.py will kill the scheduler.
         while True:
-            … (removed line not shown in this view)
+            time.sleep(30 * 24 * 3600)
 
     def launch_dp_attention_schedulers(self, server_args, port_args):
         self.launch_tensor_parallel_group(server_args, port_args, 0, None)
@@ -210,7 +210,7 @@ class DataParallelController:
         )
         # compute zmq ports for this dp rank
         rank_port_args = PortArgs.init_new(server_args, dp_rank)
-        # Data parallelism
+        # Data parallelism reuses the tensor parallelism group,
         # so all dp ranks should use the same nccl port.
         rank_port_args.nccl_port = port_args.nccl_port
 
sglang/srt/managers/detokenizer_manager.py
CHANGED
@@ -28,6 +28,7 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.io_struct import (
     BatchEmbeddingOut,
     BatchMultimodalDecodeReq,
+    BatchMultimodalOut,
     BatchStrOut,
     BatchTokenIDOut,
 )
@@ -60,6 +61,8 @@ class DecodeStatus:
     decode_ids: List[int]
     surr_offset: int
     read_offset: int
+    # Offset that's sent to tokenizer for incremental update.
+    sent_offset: int = 0
 
 
 class DetokenizerManager:
@@ -151,7 +154,7 @@ class DetokenizerManager:
             self.decode_status[rid] = s
         else:
             s = self.decode_status[rid]
-            s.decode_ids
+            s.decode_ids.extend(recv_obj.decode_ids[i])
 
             read_ids.append(
                 self.trim_matched_stop(
@@ -199,13 +202,15 @@
             else:
                 new_text = find_printable_text(new_text)
 
-            … (removed lines not shown in this view)
-                recv_obj.no_stop_trim[i],
-            )
+            output_str = self.trim_matched_stop(
+                s.decoded_text + new_text,
+                recv_obj.finished_reasons[i],
+                recv_obj.no_stop_trim[i],
             )
+            # Incrementally send text.
+            incremental_output = output_str[s.sent_offset :]
+            s.sent_offset = len(output_str)
+            output_strs.append(incremental_output)
 
         return BatchStrOut(
             rids=recv_obj.rids,
@@ -232,7 +237,15 @@
         )
 
     def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
-        … (removed line not shown in this view)
+        outputs = self.tokenizer.detokenize(recv_obj)
+        return BatchMultimodalOut(
+            rids=recv_obj.rids,
+            finished_reasons=recv_obj.finished_reasons,
+            outputs=outputs,
+            prompt_tokens=recv_obj.prompt_tokens,
+            completion_tokens=recv_obj.completion_tokens,
+            cached_tokens=recv_obj.cached_tokens,
+        )
 
 
 class LimitedCapacityDict(OrderedDict):
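The new `sent_offset` field changes the detokenizer from resending the full decoded string on every step to sending only the suffix the client has not yet seen. The idea in isolation (a sketch; `DecodeStatus` is reduced here to the fields that matter):

```python
from dataclasses import dataclass

@dataclass
class DecodeStatus:
    decoded_text: str = ""
    sent_offset: int = 0  # how much of the output the client already has

def emit_increment(s: DecodeStatus, new_text: str) -> str:
    """Return only the not-yet-sent suffix of the full output string."""
    output_str = s.decoded_text + new_text
    incremental_output = output_str[s.sent_offset:]
    s.sent_offset = len(output_str)
    return incremental_output

s = DecodeStatus()
assert emit_increment(s, "Hello") == "Hello"
s.decoded_text = "Hello"
assert emit_increment(s, ", world") == ", world"  # only the new part is sent
```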
sglang/srt/managers/io_struct.py
CHANGED
@@ -12,7 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 """
-The definition of objects
+The definition of objects transferred between different
 processes (TokenizerManager, DetokenizerManager, Controller).
 """
 
@@ -790,6 +790,16 @@ class ResumeMemoryOccupationReqOutput:
     pass
 
 
+@dataclass
+class SlowDownReqInput:
+    forward_sleep_time: Optional[float]
+
+
+@dataclass
+class SlowDownReqOutput:
+    pass
+
+
 @dataclass
 class AbortReq:
     # The request id
@@ -826,6 +836,8 @@ class ProfileReqInput:
     # the caller doesn't need to run stop_profile.
     num_steps: Optional[int] = None
     activities: Optional[List[Literal["CPU", "GPU", "MEM", "CUDA_PROFILER"]]] = None
+    with_stack: Optional[bool] = None
+    record_shapes: Optional[bool] = None
 
 
 class ProfileReqType(Enum):
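The two new `ProfileReqInput` fields look like pass-throughs to `torch.profiler.profile`, which accepts `with_stack` and `record_shapes` keyword arguments; `SlowDownReqInput` carries an optional sleep to inject into forward passes. A hedged sketch of constructing such request objects (field names are from the diff, the semantics in the comments are assumptions):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class ProfileReqInput:
    num_steps: Optional[int] = None
    with_stack: Optional[bool] = None      # presumably torch.profiler's with_stack
    record_shapes: Optional[bool] = None   # presumably torch.profiler's record_shapes

@dataclass
class SlowDownReqInput:
    # Presumably seconds to sleep around each forward pass; None disables it.
    forward_sleep_time: Optional[float]

req = ProfileReqInput(num_steps=10, with_stack=True, record_shapes=True)
slow = SlowDownReqInput(forward_sleep_time=0.5)
print(req, slow)
```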
sglang/srt/managers/mm_utils.py
CHANGED
@@ -51,7 +51,7 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)
         self, input_ids: List[int], mm_inputs: MultimodalInputs
     ) -> List[int]:
         """
-        This function will replace the data-tokens
+        This function will replace the data-tokens in between with pad_values accordingly
         """
         pad_values = [item.pad_value for item in mm_inputs.mm_items]
         data_token_pairs = self.data_token_id_pairs
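For context on the clarified docstring: the pattern scans `input_ids` for (start, end) token pairs and replaces the data tokens between each pair with that item's `pad_value`. A minimal sketch of the idea (simplified to a single token pair, not the sglang implementation):

```python
from typing import List, Tuple

def pad_between_token_pairs(
    input_ids: List[int],
    pair: Tuple[int, int],
    pad_values: List[int],
) -> List[int]:
    """Replace tokens strictly between each (start, end) pair with a pad value."""
    start_id, end_id = pair
    out, i, k = [], 0, 0
    while i < len(input_ids):
        out.append(input_ids[i])
        if input_ids[i] == start_id:
            j = input_ids.index(end_id, i + 1)  # matching end token
            out.extend([pad_values[k % len(pad_values)]] * (j - i - 1))
            out.append(end_id)
            k += 1
            i = j
        i += 1
    return out

# tokens 7/8 delimit an image; 1, 2, 3 are its data tokens
print(pad_between_token_pairs([5, 7, 1, 2, 3, 8, 6], (7, 8), [99]))
# -> [5, 7, 99, 99, 99, 8, 6]
```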
sglang/srt/managers/multimodal_processors/base_processor.py
CHANGED
@@ -8,6 +8,7 @@ from typing import List, Optional
 
 import numpy as np
 import PIL
+import torch
 from PIL import Image
 from transformers import BaseImageProcessorFast
 
@@ -89,6 +90,10 @@ class BaseMultimodalProcessor(ABC):
             return_tensors="pt",
             **kwargs,
         )
+        if "pixel_values" in result and isinstance(
+            result["pixel_values"], torch.Tensor
+        ):
+            result["pixel_values"] = result["pixel_values"].to("cpu")
         return result
 
     @abstractmethod
sglang/srt/managers/multimodal_processors/internvl.py
ADDED
@@ -0,0 +1,232 @@
+# Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
+
+import numpy as np
+import torch
+from decord import VideoReader, cpu
+from numpy.distutils.cpuinfo import cpu
+from PIL import Image
+
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.internvl import InternVLChatModel
+
+
+class InternVLImageProcessor(BaseMultimodalProcessor):
+    models = [InternVLChatModel]
+
+    def __init__(self, hf_config, server_args, _image_processor):
+        super().__init__(hf_config, server_args, _image_processor)
+        image_size = hf_config.force_image_size or hf_config.vision_config.image_size
+        patch_size = hf_config.vision_config.patch_size
+
+        self.IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
+        self.IMG_START_TOKEN = "<img>"
+        self.IMG_END_TOKEN = "</img>"
+        self.IMG_TOKEN = "<image>"
+        self.num_image_token = int(
+            (image_size // patch_size) ** 2 * (hf_config.downsample_ratio**2)
+        )
+
+        tokenizer = self._processor
+        self.img_start_token_id = tokenizer.convert_tokens_to_ids(self.IMG_START_TOKEN)
+        self.img_end_token_id = tokenizer.convert_tokens_to_ids(self.IMG_END_TOKEN)
+        self.img_context_token_id = tokenizer.convert_tokens_to_ids(
+            self.IMG_CONTEXT_TOKEN
+        )
+
+    @staticmethod
+    def build_transform(input_size):
+        IMAGENET_MEAN = (0.485, 0.456, 0.406)
+        IMAGENET_STD = (0.229, 0.224, 0.225)
+
+        def resize_image(img, size):
+            return img.resize((size, size), Image.Resampling.BICUBIC)
+
+        def to_tensor(img):
+            # Convert PIL Image to numpy array
+            img_array = np.array(img).astype(np.float32) / 255.0
+            # Convert HWC to CHW format
+            img_array = img_array.transpose(2, 0, 1)
+            return torch.from_numpy(img_array)
+
+        def normalize(tensor, mean, std):
+            mean = torch.tensor(mean).view(-1, 1, 1)
+            std = torch.tensor(std).view(-1, 1, 1)
+            return (tensor - mean) / std
+
+        def transform(img):
+            img = img.convert("RGB") if img.mode != "RGB" else img
+            img = resize_image(img, input_size)
+            tensor = to_tensor(img)
+            tensor = normalize(tensor, IMAGENET_MEAN, IMAGENET_STD)
+            return tensor
+
+        return transform
+
+    @staticmethod
+    def dynamic_preprocess(
+        image, min_num=1, max_num=12, image_size=448, use_thumbnail=False
+    ):
+
+        def find_closest_aspect_ratio(
+            aspect_ratio, target_ratios, width, height, image_size
+        ):
+            best_ratio_diff = float("inf")
+            best_ratio = (1, 1)
+            area = width * height
+            for ratio in target_ratios:
+                target_aspect_ratio = ratio[0] / ratio[1]
+                ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+                if ratio_diff < best_ratio_diff:
+                    best_ratio_diff = ratio_diff
+                    best_ratio = ratio
+                elif ratio_diff == best_ratio_diff:
+                    if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                        best_ratio = ratio
+            return best_ratio
+
+        orig_width, orig_height = image.size
+        aspect_ratio = orig_width / orig_height
+
+        # calculate the existing image aspect ratio
+        target_ratios = set(
+            (i, j)
+            for n in range(min_num, max_num + 1)
+            for i in range(1, n + 1)
+            for j in range(1, n + 1)
+            if i * j <= max_num and i * j >= min_num
+        )
+        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+        # find the closest aspect ratio to the target
+        target_aspect_ratio = find_closest_aspect_ratio(
+            aspect_ratio, target_ratios, orig_width, orig_height, image_size
+        )
+
+        # calculate the target width and height
+        target_width = image_size * target_aspect_ratio[0]
+        target_height = image_size * target_aspect_ratio[1]
+        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+        # resize the image
+        resized_img = image.resize((target_width, target_height))
+        processed_images = []
+        for i in range(blocks):
+            box = (
+                (i % (target_width // image_size)) * image_size,
+                (i // (target_width // image_size)) * image_size,
+                ((i % (target_width // image_size)) + 1) * image_size,
+                ((i // (target_width // image_size)) + 1) * image_size,
+            )
+            # split the image
+            split_img = resized_img.crop(box)
+            processed_images.append(split_img)
+        assert len(processed_images) == blocks
+        if use_thumbnail and len(processed_images) != 1:
+            thumbnail_img = image.resize((image_size, image_size))
+            processed_images.append(thumbnail_img)
+        return processed_images
+
+    @staticmethod
+    def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
+        if bound:
+            start, end = bound[0], bound[1]
+        else:
+            start, end = -100000, 100000
+        start_idx = max(first_idx, round(start * fps))
+        end_idx = min(round(end * fps), max_frame)
+        seg_size = float(end_idx - start_idx) / num_segments
+        frame_indices = np.array(
+            [
+                int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
+                for idx in range(num_segments)
+            ]
+        )
+        return frame_indices
+
+    @staticmethod
+    def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
+        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+        max_frame = len(vr) - 1
+        fps = float(vr.get_avg_fps())
+
+        pixel_values_list, num_patches_list = [], []
+        transform = InternVLImageProcessor.build_transform(input_size=input_size)
+        frame_indices = InternVLImageProcessor.get_index(
+            bound, fps, max_frame, first_idx=0, num_segments=num_segments
+        )
+        for frame_index in frame_indices:
+            img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
+            img = InternVLImageProcessor.dynamic_preprocess(
+                img, image_size=input_size, use_thumbnail=True, max_num=max_num
+            )
+            pixel_values = [transform(tile) for tile in img]
+            pixel_values = torch.stack(pixel_values)
+            num_patches_list.append(pixel_values.shape[0])
+            pixel_values_list.append(pixel_values)
+        pixel_values = torch.cat(pixel_values_list)
+        return pixel_values, num_patches_list
+
+    async def process_mm_data_async(
+        self, image_data, input_text, request_obj, max_req_input_len, **kwargs
+    ):
+        if not image_data:
+            return None
+
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=MultimodalSpecialTokens(image_token=self.IMG_TOKEN),
+            max_req_input_len=max_req_input_len,
+            discard_alpha_channel=True,
+        )
+
+        def process_image_internvl(image, input_size=448, max_num=12):
+            transform = InternVLImageProcessor.build_transform(input_size=input_size)
+            images = InternVLImageProcessor.dynamic_preprocess(
+                image, image_size=input_size, use_thumbnail=True, max_num=max_num
+            )
+            pixel_values = [transform(image) for image in images]
+            pixel_values = torch.stack(pixel_values)
+            return pixel_values
+
+        num_patches_list = []
+        pixel_values = []
+        # Process each input with allocated frames
+        for image_index, (image) in enumerate(base_output.images):
+            try:
+                # TODO: video input
+                raw_image = process_image_internvl(image)
+                pixel_value = [raw_image.to(torch.bfloat16).cuda()]
+                pixel_values += pixel_value
+                num_patches = raw_image.shape[0]
+                num_patches_list += [num_patches]
+
+            except FileNotFoundError as e:
+                print(e)
+                return None
+
+        pixel_values = torch.cat(pixel_values, dim=0)
+        items = [MultimodalDataItem(pixel_values=pixel_values, modality=Modality.IMAGE)]
+
+        for idx, num_patches in enumerate(num_patches_list):
+            image_tokens = (
+                self.IMG_START_TOKEN
+                + self.IMG_CONTEXT_TOKEN * self.num_image_token * num_patches
+                + self.IMG_END_TOKEN
+            )
+            input_text = input_text.replace("<image>", image_tokens, 1)
+
+        tokenizer = self._processor
+        return {
+            "input_ids": tokenizer(input_text, return_tensors="pt")["input_ids"]
+            .flatten()
+            .tolist(),
+            "mm_items": items,
+            "im_start_id": self.img_start_token_id,
+            "im_end_id": self.img_end_token_id,
+            "im_token_id": self.img_context_token_id,
+        }
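To make `dynamic_preprocess` concrete: it enumerates all (cols, rows) grids with between `min_num` and `max_num` tiles, picks the grid whose aspect ratio is closest to the image's, resizes the image to `cols*448 x rows*448`, and crops one tile per cell (plus an optional thumbnail). One thing a reviewer may notice in the file above: `cpu` is imported from both `decord` and `numpy.distutils.cpuinfo`, and the second import shadows the first, which `load_video` relies on. The grid-selection math in isolation (pure Python, mirroring the code above; the tie-break on image area is omitted):

```python
def pick_grid(width, height, min_num=1, max_num=12):
    """Choose the (cols, rows) tile grid whose aspect ratio best matches the image."""
    aspect = width / height
    grids = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    }
    # closest aspect ratio wins
    return min(grids, key=lambda g: abs(aspect - g[0] / g[1]))

cols, rows = pick_grid(1280, 800)   # a 16:10 image
print(cols, rows, cols * rows)      # (3, 2) -> 6 tiles of 448x448
```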