PyPI - sglang - Versions diffs - 0.3.4__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl - Mend

sglang 0.3.4__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

sglang/bench_latency.py +2 -1
sglang/lang/chat_template.py +17 -0
sglang/launch_server_llavavid.py +1 -1
sglang/srt/configs/__init__.py +3 -0
sglang/srt/configs/model_config.py +27 -2
sglang/srt/configs/qwen2vl.py +133 -0
sglang/srt/constrained/fsm_cache.py +10 -3
sglang/srt/conversation.py +27 -0
sglang/srt/hf_transformers_utils.py +16 -1
sglang/srt/layers/attention/__init__.py +16 -5
sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
sglang/srt/layers/attention/flashinfer_backend.py +174 -54
sglang/srt/layers/attention/triton_backend.py +22 -6
sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
sglang/srt/layers/linear.py +89 -63
sglang/srt/layers/logits_processor.py +5 -5
sglang/srt/layers/rotary_embedding.py +112 -0
sglang/srt/layers/sampler.py +51 -39
sglang/srt/lora/lora.py +3 -1
sglang/srt/managers/data_parallel_controller.py +1 -1
sglang/srt/managers/detokenizer_manager.py +4 -0
sglang/srt/managers/image_processor.py +186 -13
sglang/srt/managers/io_struct.py +10 -0
sglang/srt/managers/schedule_batch.py +238 -68
sglang/srt/managers/scheduler.py +69 -50
sglang/srt/managers/tokenizer_manager.py +24 -4
sglang/srt/managers/tp_worker.py +26 -111
sglang/srt/managers/tp_worker_overlap_thread.py +209 -0
sglang/srt/mem_cache/memory_pool.py +56 -10
sglang/srt/mem_cache/radix_cache.py +4 -3
sglang/srt/model_executor/cuda_graph_runner.py +87 -28
sglang/srt/model_executor/forward_batch_info.py +83 -3
sglang/srt/model_executor/model_runner.py +32 -11
sglang/srt/models/chatglm.py +3 -3
sglang/srt/models/deepseek_v2.py +2 -2
sglang/srt/models/mllama.py +1004 -0
sglang/srt/models/qwen2_vl.py +724 -0
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
sglang/srt/sampling/sampling_batch_info.py +13 -3
sglang/srt/sampling/sampling_params.py +5 -7
sglang/srt/server.py +12 -0
sglang/srt/server_args.py +10 -0
sglang/srt/utils.py +22 -0
sglang/test/run_eval.py +2 -0
sglang/test/runners.py +20 -1
sglang/test/srt/sampling/penaltylib/utils.py +1 -0
sglang/test/test_utils.py +100 -3
sglang/version.py +1 -1
{sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +17 -18
{sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +53 -48
{sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
{sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
{sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0

sglang/srt/managers/image_processor.py CHANGED Viewed

@@ -33,26 +33,32 @@ def init_global_processor(server_args: ServerArgs):
 class BaseImageProcessor(ABC):
+    def __init__(self, hf_config, server_args, _processor):
+        self.hf_config = hf_config
+        self._processor = _processor
+        self.executor = concurrent.futures.ProcessPoolExecutor(
+            initializer=init_global_processor,
+            mp_context=mp.get_context("fork"),
+            initargs=(server_args,),
+            max_workers=os.environ.get("SGLANG_CPU_COUNT", os.cpu_count()),
+        )
     @abstractmethod
-    async def process_images_async(self, image_data, **kwargs):
+    async def process_images_async(self, image_data, input_text, **kwargs):
         pass
 class DummyImageProcessor(BaseImageProcessor):
+    def __init__(self):
+        pass
     async def process_images_async(self, *args, **kwargs):
         return None
 class LlavaImageProcessor(BaseImageProcessor):
-    def __init__(self, hf_config, server_args, _image_processor):
-        self.hf_config = hf_config
-        self._image_processor = _image_processor
-        self.executor = concurrent.futures.ProcessPoolExecutor(
-            initializer=init_global_processor,
-            mp_context=mp.get_context("fork"),
-            initargs=(server_args,),
-            max_workers=os.environ.get("SGLANG_CPU_COUNT", os.cpu_count()),
-        )
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
     @staticmethod
     def _process_single_image_task(
@@ -119,7 +125,7 @@ class LlavaImageProcessor(BaseImageProcessor):
             )
     async def process_images_async(
-        self, image_data: List[Union[str, bytes]], request_obj
+        self, image_data: List[Union[str, bytes]], input_text, request_obj
     ):
         if not image_data:
             return None
@@ -177,10 +183,177 @@ class LlavaImageProcessor(BaseImageProcessor):
         }
+class MllamaImageProcessor(BaseImageProcessor):
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+    @staticmethod
+    def _process_single_image_task(images, input_text):
+        # input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask'
+        return global_processor(images, input_text, return_tensors="pt")
+    async def _process_single_image(self, images, input_text):
+        if self.executor is not None:
+            loop = asyncio.get_event_loop()
+            image_inputs = await loop.run_in_executor(
+                self.executor,
+                MllamaImageProcessor._process_single_image_task,
+                images,
+                input_text,
+            )
+        else:
+            image_inputs = self._processor(images, input_text, return_tensors="pt")
+        return image_inputs
+    async def process_images_async(
+        self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
+    ):
+        if not image_data:
+            return None
+        if isinstance(input_text, list):
+            assert len(input_text) and isinstance(input_text[0], int)
+            input_text = self._processor.tokenizer.decode(input_text)
+        if not isinstance(image_data, list):
+            image_data = [image_data]
+        if len(image_data) > 0:
+            images = [load_image(image)[0] for image in image_data]
+        else:
+            images = load_image(image_data[0])[0]
+        image_inputs = await self._process_single_image(images, input_text)
+        image_inputs["image_hashes"] = [hash(str(image_data))]
+        image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
+        return image_inputs
+class Qwen2VLImageProcessor(BaseImageProcessor):
+    def __init__(self, hf_config, server_args, _image_processor):
+        self.hf_config = hf_config
+        self._image_processor = _image_processor
+        self.executor = concurrent.futures.ProcessPoolExecutor(
+            initializer=init_global_processor,
+            mp_context=mp.get_context("fork"),
+            initargs=(server_args,),
+            max_workers=os.environ.get("SGLANG_CPU_COUNT", os.cpu_count()),
+        )
+    @staticmethod
+    def _process_single_image_task(
+        image_data: Union[str, bytes],
+        image_processor=None,
+    ):
+        image_processor = image_processor or global_processor.image_processor
+        try:
+            image, image_size = load_image(image_data)
+            if image_size is not None:
+                # It is a video with multiple images
+                image_hash = hash(image_data)
+                process_result = image_processor(image)
+                pixel_values, image_grid_thws = (
+                    process_result["pixel_values"],
+                    process_result["image_grid_thw"][0],
+                )
+                for _ in range(len(pixel_values)):
+                    pixel_values[_] = pixel_values[_].astype(np.float16)
+                pixel_values = np.stack(pixel_values, axis=0)
+                image_grid_thws = np.stack(image_grid_thws, axis=0)
+                return pixel_values, image_hash, image_size, image_grid_thws
+            else:
+                # It is an image
+                image_hash = hash(image_data)
+                process_result = image_processor(image)
+                pixel_values, image_grid_thws = (
+                    process_result["pixel_values"],
+                    process_result["image_grid_thw"][0],
+                )
+                if isinstance(pixel_values, np.ndarray):
+                    pixel_values = pixel_values.astype(np.float16)
+                return pixel_values, image_hash, image.size, image_grid_thws
+        except Exception:
+            logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
+    async def _process_single_image(self, image_data: Union[bytes, str]):
+        if self.executor is not None:
+            loop = asyncio.get_event_loop()
+            return await loop.run_in_executor(
+                self.executor,
+                Qwen2VLImageProcessor._process_single_image_task,
+                image_data,
+            )
+        else:
+            return self._process_single_image_task(image_data)
+    async def process_images_async(
+        self, image_data: List[Union[str, bytes]], input_text, request_obj
+    ):
+        if not image_data:
+            return None
+        if isinstance(image_data, list) and len(image_data) > 0:
+            # Multiple images
+            if len(image_data) > 1:
+                pixel_values, image_hashes, image_sizes, image_grid_thws = (
+                    [],
+                    [],
+                    [],
+                    [],
+                )
+                res = []
+                for img_data in image_data:
+                    res.append(self._process_single_image(img_data))
+                res = await asyncio.gather(*res)
+                for pixel_v, image_h, image_s, image_thw in res:
+                    pixel_values.append(pixel_v)
+                    image_hashes.append(image_h)
+                    image_sizes.append(image_s)
+                    image_grid_thws.append(image_thw)
+                if isinstance(pixel_values[0], np.ndarray):
+                    pixel_values = np.concatenate(pixel_values, axis=0)
+            else:
+                # A single image
+                pixel_values, image_hash, image_size, image_grid_thw = (
+                    await self._process_single_image(image_data[0])
+                )
+                image_hashes = [image_hash]
+                image_sizes = [image_size]
+                image_grid_thws = [image_grid_thw]
+        elif isinstance(image_data, str):
+            # A single image
+            pixel_values, image_hash, image_size, image_grid_thw = (
+                await self._process_single_image(image_data)
+            )
+            image_hashes = [image_hash]
+            image_sizes = [image_size]
+            image_grid_thws = [image_grid_thw]
+        else:
+            raise ValueError(f"Invalid image data: {image_data}")
+        return {
+            "pixel_values": pixel_values,
+            "image_hashes": image_hashes,
+            "image_sizes": image_sizes,
+            "modalities": request_obj.modalities,
+            "image_grid_thws": image_grid_thws,
+        }
 def get_image_processor(
-    hf_config, server_args: ServerArgs, _image_processor
+    hf_config, server_args: ServerArgs, processor
 ) -> BaseImageProcessor:
-    return LlavaImageProcessor(hf_config, server_args, _image_processor)
+    if "MllamaForConditionalGeneration" in hf_config.architectures:
+        return MllamaImageProcessor(hf_config, server_args, processor)
+    elif "Qwen2VLForConditionalGeneration" in hf_config.architectures:
+        return Qwen2VLImageProcessor(hf_config, server_args, processor.image_processor)
+    else:
+        return LlavaImageProcessor(hf_config, server_args, processor.image_processor)
 def get_dummy_image_processor():

sglang/srt/managers/io_struct.py CHANGED Viewed

@@ -353,3 +353,13 @@ class AbortReq:
 class ProfileReq(Enum):
     START_PROFILE = 1
     STOP_PROFILE = 2
+@dataclass
+class GetMemPoolSizeReq:
+    pass
+@dataclass
+class GetMemPoolSizeReqOutput:
+    size: int

sglang 0.3.4__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl

sglang 0.3.4__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl