sglang 0.3.1.post3__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +23 -1
- sglang/bench_latency.py +48 -33
- sglang/bench_server_latency.py +0 -6
- sglang/bench_serving.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +14 -1
- sglang/lang/interpreter.py +16 -6
- sglang/lang/ir.py +20 -4
- sglang/srt/configs/model_config.py +11 -9
- sglang/srt/constrained/fsm_cache.py +9 -1
- sglang/srt/constrained/jump_forward.py +15 -2
- sglang/srt/hf_transformers_utils.py +1 -0
- sglang/srt/layers/activation.py +4 -4
- sglang/srt/layers/attention/__init__.py +49 -0
- sglang/srt/layers/attention/flashinfer_backend.py +277 -0
- sglang/srt/layers/{flashinfer_utils.py → attention/flashinfer_utils.py} +82 -80
- sglang/srt/layers/attention/triton_backend.py +161 -0
- sglang/srt/layers/{triton_attention → attention/triton_ops}/extend_attention.py +3 -1
- sglang/srt/layers/fused_moe/patch.py +117 -0
- sglang/srt/layers/layernorm.py +4 -4
- sglang/srt/layers/logits_processor.py +19 -15
- sglang/srt/layers/pooler.py +3 -3
- sglang/srt/layers/quantization/__init__.py +0 -2
- sglang/srt/layers/radix_attention.py +6 -4
- sglang/srt/layers/sampler.py +6 -4
- sglang/srt/layers/torchao_utils.py +18 -0
- sglang/srt/lora/lora.py +20 -21
- sglang/srt/lora/lora_manager.py +97 -25
- sglang/srt/managers/detokenizer_manager.py +31 -18
- sglang/srt/managers/image_processor.py +187 -0
- sglang/srt/managers/io_struct.py +99 -75
- sglang/srt/managers/schedule_batch.py +187 -68
- sglang/srt/managers/{policy_scheduler.py → schedule_policy.py} +31 -21
- sglang/srt/managers/scheduler.py +1021 -0
- sglang/srt/managers/tokenizer_manager.py +120 -247
- sglang/srt/managers/tp_worker.py +28 -925
- sglang/srt/mem_cache/memory_pool.py +34 -52
- sglang/srt/mem_cache/radix_cache.py +5 -5
- sglang/srt/model_executor/cuda_graph_runner.py +25 -25
- sglang/srt/model_executor/forward_batch_info.py +94 -97
- sglang/srt/model_executor/model_runner.py +76 -78
- sglang/srt/models/baichuan.py +10 -10
- sglang/srt/models/chatglm.py +12 -12
- sglang/srt/models/commandr.py +10 -10
- sglang/srt/models/dbrx.py +12 -12
- sglang/srt/models/deepseek.py +10 -10
- sglang/srt/models/deepseek_v2.py +14 -15
- sglang/srt/models/exaone.py +10 -10
- sglang/srt/models/gemma.py +10 -10
- sglang/srt/models/gemma2.py +11 -11
- sglang/srt/models/gpt_bigcode.py +10 -10
- sglang/srt/models/grok.py +10 -10
- sglang/srt/models/internlm2.py +10 -10
- sglang/srt/models/llama.py +22 -10
- sglang/srt/models/llama_classification.py +5 -5
- sglang/srt/models/llama_embedding.py +4 -4
- sglang/srt/models/llama_reward.py +142 -0
- sglang/srt/models/llava.py +39 -33
- sglang/srt/models/llavavid.py +31 -28
- sglang/srt/models/minicpm.py +10 -10
- sglang/srt/models/minicpm3.py +14 -15
- sglang/srt/models/mixtral.py +10 -10
- sglang/srt/models/mixtral_quant.py +10 -10
- sglang/srt/models/olmoe.py +10 -10
- sglang/srt/models/qwen.py +10 -10
- sglang/srt/models/qwen2.py +11 -11
- sglang/srt/models/qwen2_moe.py +10 -10
- sglang/srt/models/stablelm.py +10 -10
- sglang/srt/models/torch_native_llama.py +506 -0
- sglang/srt/models/xverse.py +10 -10
- sglang/srt/models/xverse_moe.py +10 -10
- sglang/srt/openai_api/adapter.py +7 -0
- sglang/srt/sampling/sampling_batch_info.py +36 -27
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +170 -119
- sglang/srt/server_args.py +54 -27
- sglang/srt/utils.py +101 -128
- sglang/test/runners.py +76 -33
- sglang/test/test_programs.py +38 -5
- sglang/test/test_utils.py +53 -9
- sglang/version.py +1 -1
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/METADATA +42 -23
- sglang-0.3.3.dist-info/RECORD +139 -0
- sglang/srt/layers/attention_backend.py +0 -482
- sglang/srt/managers/controller_multi.py +0 -207
- sglang/srt/managers/controller_single.py +0 -164
- sglang-0.3.1.post3.dist-info/RECORD +0 -134
- /sglang/srt/layers/{triton_attention → attention/triton_ops}/decode_attention.py +0 -0
- /sglang/srt/layers/{triton_attention → attention/triton_ops}/prefill_attention.py +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/LICENSE +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/WHEEL +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tokenizer_manager.py

```diff
@@ -16,17 +16,13 @@ limitations under the License.
 """TokenizerManager is a process that tokenizes the text."""
 
 import asyncio
-import concurrent.futures
 import dataclasses
 import json
 import logging
-import multiprocessing as mp
 import os
 from typing import Dict, List, Optional, Tuple, Union
 
 import fastapi
-import numpy as np
-import transformers
 import uvloop
 import zmq
 import zmq.asyncio
```
```diff
@@ -38,6 +34,10 @@ from sglang.srt.hf_transformers_utils import (
     get_processor,
     get_tokenizer,
 )
+from sglang.srt.managers.image_processor import (
+    get_dummy_image_processor,
+    get_image_processor,
+)
 from sglang.srt.managers.io_struct import (
     AbortReq,
     BatchEmbeddingOut,
```
```diff
@@ -46,16 +46,16 @@ from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     FlushCacheReq,
     GenerateReqInput,
+    RewardReqInput,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
+    TokenizedRewardReqInput,
     UpdateWeightReqInput,
     UpdateWeightReqOutput,
 )
-from sglang.srt.mm_utils import expand2square, process_anyres_image
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import is_generation_model, is_multimodal_model
-from sglang.utils import get_exception_traceback
+from sglang.srt.utils import is_generation_model, is_multimodal_model
 
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
```
```diff
@@ -84,10 +84,10 @@ class TokenizerManager:
         # Init inter-process communication
         context = zmq.asyncio.Context(2)
         self.recv_from_detokenizer = context.socket(zmq.PULL)
-        self.recv_from_detokenizer.bind(f"
+        self.recv_from_detokenizer.bind(f"ipc://{port_args.tokenizer_ipc_name}")
 
-        self.
-        self.
+        self.send_to_scheduler = context.socket(zmq.PUSH)
+        self.send_to_scheduler.connect(f"ipc://{port_args.scheduler_input_ipc_name}")
 
         # Read model args
         self.model_path = server_args.model_path
```
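This hunk is the core of the 0.3.3 process-model change visible in the file list above: `controller_multi.py` and `controller_single.py` are gone, `scheduler.py` is new, and the TokenizerManager now pushes tokenized requests straight to the scheduler over ZeroMQ IPC instead of a controller socket. Below is a minimal, self-contained sketch of the same PULL/PUSH wiring; the `ipc://` path and payload are made up for illustration, and only the socket pattern mirrors the diff.

```python
import asyncio

import zmq
import zmq.asyncio


async def main():
    context = zmq.asyncio.Context(2)

    # Receiver binds a PULL socket on a named IPC endpoint
    # (as recv_from_detokenizer does above).
    recv_sock = context.socket(zmq.PULL)
    recv_sock.bind("ipc:///tmp/demo_scheduler_input")  # hypothetical endpoint

    # Sender connects a PUSH socket to the same endpoint
    # (as send_to_scheduler does above).
    send_sock = context.socket(zmq.PUSH)
    send_sock.connect("ipc:///tmp/demo_scheduler_input")

    # Python objects travel as pickles via send_pyobj/recv_pyobj,
    # the same calls the manager uses for its tokenized request objects.
    await send_sock.send_pyobj({"rid": "demo", "input_ids": [1, 2, 3]})
    print(await recv_sock.recv_pyobj())

    context.destroy()


asyncio.run(main())
```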
```diff
@@ -103,6 +103,8 @@ class TokenizerManager:
         self.context_len = server_args.context_length or get_context_length(
             self.hf_config
         )
+        # Create image processor placeholder
+        self.image_processor = get_dummy_image_processor()
 
         # Create tokenizer
         if server_args.skip_tokenizer_init:
@@ -117,12 +119,9 @@
             self.tokenizer = self.processor.tokenizer
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-            # We want to parallelize the image pre-processing so we
-
-
-                initializer=init_global_processor,
-                mp_context=mp.get_context("fork"),
-                initargs=(server_args,),
+            # We want to parallelize the image pre-processing so we create an executor for it
+            self.image_processor = get_image_processor(
+                self.hf_config, server_args, self.processor.image_processor
             )
         else:
             self.tokenizer = get_tokenizer(
@@ -141,7 +140,7 @@
 
     async def generate_request(
         self,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         request: Optional[fastapi.Request] = None,
     ):
         if self.to_create_loop:
```
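The constructor hunk above also drops the inline `concurrent.futures`/`multiprocessing` executor setup (note the orphaned `initializer=init_global_processor` and `mp_context=mp.get_context("fork")` arguments) and delegates to `get_image_processor` from the new `sglang/srt/managers/image_processor.py`. The underlying idea stays the same: keep CPU-bound image preprocessing off the asyncio event loop by fanning it out to a process pool. Below is a generic sketch of that pattern, with a stand-in `preprocess` function rather than sglang's real one (the `"fork"` context assumes a POSIX host).

```python
import asyncio
import concurrent.futures
import multiprocessing as mp


def preprocess(image_bytes: bytes) -> int:
    # Stand-in for real pixel-value extraction; any CPU-bound work goes here.
    return len(image_bytes)


class AsyncImagePreprocessor:
    def __init__(self):
        # "fork" keeps worker startup cheap, echoing the removed mp_context choice.
        self.executor = concurrent.futures.ProcessPoolExecutor(
            max_workers=2, mp_context=mp.get_context("fork")
        )

    async def process_images_async(self, images):
        loop = asyncio.get_running_loop()
        # Offload every image to the pool without blocking the event loop.
        futures = [loop.run_in_executor(self.executor, preprocess, im) for im in images]
        return await asyncio.gather(*futures)


async def main():
    prep = AsyncImagePreprocessor()
    print(await prep.process_images_async([b"fake-image-1", b"fake-image-2"]))


if __name__ == "__main__":
    asyncio.run(main())
```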
```diff
@@ -160,53 +159,72 @@
             async for response in self._handle_batch_request(obj, request):
                 yield response
 
-    async def
+    async def _send_single_request(
         self,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
-        request: Optional[fastapi.Request] = None,
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         index: Optional[int] = None,
+        input_id_index: Optional[int] = None,
         is_cache_for_prefill: Optional[bool] = False,
     ):
         if not is_cache_for_prefill:  # The normal case with a single prompt
-
+            if index is None:
+                rid = obj.rid
+                if hasattr(obj, "conv"):
+                    # reward model
+                    conv = obj.conv
+                    input_text = self.tokenizer.apply_chat_template(
+                        conv, tokenize=False
+                    )
+                    input_ids = self.tokenizer.encode(input_text)
+                elif obj.input_ids is None:
+                    input_text = obj.text
+                    input_ids = self.tokenizer.encode(input_text)
+                else:
+                    input_text = obj.text if obj.text is not None else None
+                    input_ids = obj.input_ids
 
-
-
-
-
-
+                sampling_params = self._get_sampling_params(obj.sampling_params)
+                if self.is_generation:
+                    image_inputs = await self.image_processor.process_images_async(
+                        obj.image_data, obj
+                    )
+                    return_logprob = obj.return_logprob
+                    logprob_start_len = obj.logprob_start_len
+                    top_logprobs_num = obj.top_logprobs_num
             else:
-
+                rid = obj.rid[index]
+                if hasattr(obj, "conv"):
+                    # reward model
+                    conv = obj.conv[index]
+                    input_text = self.tokenizer.apply_chat_template(
+                        conv, tokenize=False
+                    )
+                    input_ids = self.tokenizer.encode(input_text)
+                elif obj.input_ids is None:
+                    input_text = obj.text[input_id_index]
+                    input_ids = self.tokenizer.encode(input_text)
+                else:
+                    input_text = (
+                        obj.text[input_id_index] if obj.text is not None else None
+                    )
+                    input_ids = obj.input_ids[input_id_index]
 
-
+                sampling_params = self._get_sampling_params(obj.sampling_params[index])
+                if self.is_generation:
+                    image_inputs = await self.image_processor.process_images_async(
+                        obj.image_data[index], obj
+                    )
+                    return_logprob = obj.return_logprob[index]
+                    logprob_start_len = obj.logprob_start_len[index]
+                    top_logprobs_num = obj.top_logprobs_num[index]
 
-
-                obj.sampling_params if not_use_index else obj.sampling_params[index]
-            )
+            self._validate_input_length(input_ids)
 
-            if self.is_generation:
-                pixel_values, image_hashes, image_sizes = await self._get_pixel_values(
-                    obj.image_data if not_use_index else obj.image_data[index]
-                )
-                modalities = obj.modalities
-                return_logprob = (
-                    obj.return_logprob if not_use_index else obj.return_logprob[index]
-                )
-                logprob_start_len = (
-                    obj.logprob_start_len
-                    if not_use_index
-                    else obj.logprob_start_len[index]
-                )
-                top_logprobs_num = (
-                    obj.top_logprobs_num
-                    if not_use_index
-                    else obj.top_logprobs_num[index]
-                )
         else:  # A prefill request to cache the common prompt for parallel sampling
             assert self.is_generation
             if obj.text is not None:
                 if isinstance(obj.text, list):
-                    input_text = obj.text[
+                    input_text = obj.text[input_id_index]
                     rid = obj.rid[index]
                 else:
                     input_text = obj.text
@@ -220,7 +238,7 @@
                     obj.input_ids[0], list
                 ):
                     # when obj["input_ids"] is List[List[int]]
-                    input_ids = obj.input_ids[
+                    input_ids = obj.input_ids[input_id_index]
                     rid = obj.rid[index]
                 else:
                     input_ids = obj.input_ids
@@ -231,7 +249,7 @@
                     obj.input_ids[0], list
                 ):
                     # when obj["input_ids"] is List[List[int]]
-                    input_ids = obj.input_ids[
+                    input_ids = obj.input_ids[input_id_index]
                     rid = obj.rid[index]
                 else:
                     input_ids = obj.input_ids
@@ -239,10 +257,9 @@
 
             sampling_params = SamplingParams(**obj.sampling_params[0])
             sampling_params.max_new_tokens = 0
-
-                obj.image_data[0]
+            image_inputs = await self.image_processor.process_images_async(
+                obj.image_data[0], obj
             )
-            modalities = obj.modalities
             return_logprob = obj.return_logprob[0]
             logprob_start_len = obj.logprob_start_len[0]
             top_logprobs_num = obj.top_logprobs_num[0]
@@ -253,34 +270,57 @@
                 rid,
                 input_text,
                 input_ids,
-
-                image_hashes,
-                image_sizes,
+                image_inputs,
                 sampling_params,
                 return_logprob,
                 logprob_start_len,
                 top_logprobs_num,
                 obj.stream,
-                modalities,
                 (
-                    obj.lora_path[
+                    obj.lora_path[input_id_index]
                     if isinstance(obj.lora_path, list)
                     else obj.lora_path
                 ),
             )
-
+        elif isinstance(obj, EmbeddingReqInput):
             tokenized_obj = TokenizedEmbeddingReqInput(
                 rid,
                 input_text,
                 input_ids,
                 sampling_params,
             )
-
+        else:
+            assert isinstance(obj, RewardReqInput)
+            tokenized_obj = TokenizedRewardReqInput(
+                rid,
+                input_text,
+                input_ids,
+                sampling_params,
+            )
+
+        self.send_to_scheduler.send_pyobj(tokenized_obj)
+        return rid, input_ids
+
+    async def _handle_single_request(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
+        request: Optional[fastapi.Request] = None,
+        index: Optional[int] = None,
+        input_id_index: Optional[int] = None,
+        is_cache_for_prefill: Optional[bool] = False,
+    ):
+        rid, input_ids = await self._send_single_request(
+            obj,
+            index,
+            input_id_index=input_id_index,
+            is_cache_for_prefill=is_cache_for_prefill,
+        )
 
         # Recv results
         event = asyncio.Event()
         state = ReqState([], False, event)
         self.rid_to_state[rid] = state
+
         if not is_cache_for_prefill:
             async for response in self._wait_for_response(state, obj, rid, request):
                 yield response
@@ -291,7 +331,7 @@
 
     async def _handle_batch_request(
         self,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         request: Optional[fastapi.Request] = None,
     ):
         batch_size = obj.batch_size
@@ -304,14 +344,16 @@
             input_id_result = [] if obj.input_ids is None else None
             for i in range(batch_size):
                 async for input_id in self._handle_single_request(
-                    obj,
+                    obj,
+                    request,
+                    index=i,
+                    input_id_index=i,
+                    is_cache_for_prefill=True,
                 ):
                     if input_id_result is not None:
                         input_id_result.append(input_id)
-            if input_id_result is not None
+            if input_id_result is not None:
                 obj.input_ids = input_id_result
-            elif input_id_result is not None:
-                obj.input_ids = input_id_result[0]
         else:
             parallel_sample_num = 1
 
@@ -325,58 +367,10 @@
                 if parallel_sample_num != 1:
                     # Here when using parallel sampling we should consider prefill stage so the index is : j + i * (parallel_sample_num-1) + batch_size - 1
                     index += batch_size - 1 - i
-                rid = obj.rid[index]
-                if parallel_sample_num == 1:
-                    ## select operation
-                    if obj.input_ids is None:
-                        input_text = obj.text[i]
-                        input_ids = self.tokenizer.encode(obj.text[i])
-                    else:
-                        input_text = None
-                        input_ids = obj.input_ids[i]
-                else:
-                    assert obj.input_ids is not None
-                    if batch_size == 1:
-                        input_text = None
-                        input_ids = obj.input_ids
-                    else:
-                        input_text = None
-                        input_ids = obj.input_ids[i]
-                sampling_params = self._get_sampling_params(obj.sampling_params[index])
-
-                if self.is_generation:
-                    pixel_values, image_hashes, image_sizes = (
-                        await self._get_pixel_values(obj.image_data[index])
-                    )
-                    modalities = obj.modalities
 
-
-
-
-                        input_ids,
-                        pixel_values,
-                        image_hashes,
-                        image_sizes,
-                        sampling_params,
-                        obj.return_logprob[index],
-                        obj.logprob_start_len[index],
-                        obj.top_logprobs_num[index],
-                        obj.stream,
-                        modalities,
-                        (
-                            obj.lora_path[index]
-                            if isinstance(obj.lora_path, list)
-                            else obj.lora_path
-                        ),
-                    )
-                else:
-                    tokenized_obj = TokenizedEmbeddingReqInput(
-                        rid,
-                        input_text,
-                        input_ids,
-                        sampling_params,
-                    )
-                self.send_to_controller.send_pyobj(tokenized_obj)
+                rid, _ = await self._send_single_request(
+                    obj, index, input_id_index=i, is_cache_for_prefill=False
+                )
 
                 event = asyncio.Event()
                 state = ReqState([], False, event)
@@ -399,7 +393,7 @@
         tasks = [asyncio.create_task(gen.__anext__()) for gen in generators]
         output_list = [None] * len(tasks)
 
-        #
+        # Fetch results
         while tasks:
             done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
 
@@ -441,7 +435,7 @@
     async def _wait_for_response(
         self,
        state: ReqState,
-        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        obj: Union[GenerateReqInput, EmbeddingReqInput, RewardReqInput],
         rid: str,
         request: Optional[fastapi.Request] = None,
         index: Optional[int] = None,
@@ -468,7 +462,7 @@
                 ),
                 obj.return_text_in_logprobs,
             )
-        else:  # isinstance(obj, EmbeddingReqInput)
+        else:  # isinstance(obj, (EmbeddingReqInput, RewardReqInput))
             out = state.out_list[-1]
 
         out["index"] = response_index
@@ -509,14 +503,14 @@
 
     def flush_cache(self):
         req = FlushCacheReq()
-        self.
+        self.send_to_scheduler.send_pyobj(req)
 
     def abort_request(self, rid: str):
         if rid not in self.rid_to_state:
             return
         del self.rid_to_state[rid]
         req = AbortReq(rid)
-        self.
+        self.send_to_scheduler.send_pyobj(req)
 
     async def update_weights(
         self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None
@@ -532,8 +526,8 @@
         async with self.model_update_lock:
             # wait for the previous generation requests to finish
             while len(self.rid_to_state) > 0:
-                await asyncio.sleep(0)
-            self.
+                await asyncio.sleep(0.001)
+            self.send_to_scheduler.send_pyobj(obj)
         self.model_update_result = asyncio.Future()
         result = await self.model_update_result
         if result.success:
@@ -644,6 +638,7 @@
     def detokenize_logprob_tokens(
         self, token_logprobs: List[Tuple[float, int]], decode_to_text: bool
     ):
+        # TODO(lianmin): This should run on DetokenizerManager
        if not decode_to_text:
            return [(logprob, token_id, None) for logprob, token_id in token_logprobs]
 
@@ -664,125 +659,3 @@
             token_top_logprobs, decode_to_text
         )
         return top_logprobs
-
-    async def _get_pixel_values(self, image_data: List[Union[str, bytes]]):
-        if not image_data:
-            return None, None, None
-
-        aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
-        grid_pinpoints = (
-            self.hf_config.image_grid_pinpoints
-            if hasattr(self.hf_config, "image_grid_pinpoints")
-            and "anyres" in aspect_ratio
-            else None
-        )
-
-        if isinstance(image_data, list) and len(image_data) > 0:
-            # Multiple images
-            if len(image_data) > 1:
-                aspect_ratio = "pad"  # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
-                pixel_values, image_hashes, image_sizes = [], [], []
-                for img_data in image_data:
-                    pixel_v, image_h, image_s = await self._process_single_image(
-                        img_data, aspect_ratio, grid_pinpoints
-                    )
-                    pixel_values.append(pixel_v)
-                    image_hashes.append(image_h)
-                    image_sizes.append(image_s)
-
-                if isinstance(pixel_values[0], np.ndarray):
-                    pixel_values = np.stack(pixel_values, axis=0)
-            else:
-                # A single image
-                pixel_values, image_hash, image_size = await self._process_single_image(
-                    image_data[0], aspect_ratio, grid_pinpoints
-                )
-                image_hashes = [image_hash]
-                image_sizes = [image_size]
-        elif isinstance(image_data, str):
-            # A single image
-            pixel_values, image_hash, image_size = await self._process_single_image(
-                image_data, aspect_ratio, grid_pinpoints
-            )
-            image_hashes = [image_hash]
-            image_sizes = [image_size]
-        else:
-            raise ValueError(f"Invalid image data: {image_data}")
-
-        return pixel_values, image_hashes, image_sizes
-
-    async def _process_single_image(
-        self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str
-    ):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            return await loop.run_in_executor(
-                self.executor,
-                _process_single_image_task,
-                image_data,
-                aspect_ratio,
-                grid_pinpoints,
-            )
-        else:
-            return _process_single_image_task(
-                image_data, aspect_ratio, grid_pinpoints, self.processor
-            )
-
-
-global global_processor
-
-
-def init_global_processor(server_args: ServerArgs):
-    """Init the global processor for multi modal models."""
-    global global_processor
-    transformers.logging.set_verbosity_error()
-    global_processor = get_processor(
-        server_args.tokenizer_path,
-        tokenizer_mode=server_args.tokenizer_mode,
-        trust_remote_code=server_args.trust_remote_code,
-    )
-
-
-def _process_single_image_task(
-    image_data: Union[str, bytes],
-    image_aspect_ratio: Optional[str] = None,
-    image_grid_pinpoints: Optional[str] = None,
-    processor=None,
-):
-    try:
-        processor = processor or global_processor
-        image, image_size = load_image(image_data)
-        if image_size is not None:
-            # It is a video with multiple images
-            image_hash = hash(image_data)
-            pixel_values = processor.image_processor(image)["pixel_values"]
-            for _ in range(len(pixel_values)):
-                pixel_values[_] = pixel_values[_].astype(np.float16)
-            pixel_values = np.stack(pixel_values, axis=0)
-            return pixel_values, image_hash, image_size
-        else:
-            # It is an image
-            image_hash = hash(image_data)
-            if image_aspect_ratio == "pad":
-                image = expand2square(
-                    image,
-                    tuple(int(x * 255) for x in processor.image_processor.image_mean),
-                )
-                pixel_values = processor.image_processor(image.convert("RGB"))[
-                    "pixel_values"
-                ][0]
-            elif image_aspect_ratio == "anyres" or (
-                image_aspect_ratio is not None and "anyres_max" in image_aspect_ratio
-            ):
-                pixel_values = process_anyres_image(
-                    image, processor.image_processor, image_grid_pinpoints
-                )
-            else:
-                pixel_values = processor.image_processor(image)["pixel_values"][0]
-
-            if isinstance(pixel_values, np.ndarray):
-                pixel_values = pixel_values.astype(np.float16)
-
-            return pixel_values, image_hash, image.size
-    except Exception:
-        logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
```