sglang 0.4.9.post1__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- sglang/srt/configs/model_config.py +24 -1
- sglang/srt/conversation.py +21 -2
- sglang/srt/disaggregation/ascend/__init__.py +6 -0
- sglang/srt/disaggregation/ascend/conn.py +44 -0
- sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
- sglang/srt/disaggregation/mooncake/conn.py +15 -14
- sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
- sglang/srt/disaggregation/utils.py +25 -3
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/entrypoints/http_server.py +1 -0
- sglang/srt/entrypoints/openai/protocol.py +11 -0
- sglang/srt/entrypoints/openai/serving_chat.py +7 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/kimik2_detector.py +220 -0
- sglang/srt/hf_transformers_utils.py +18 -0
- sglang/srt/jinja_template_utils.py +8 -0
- sglang/srt/layers/communicator.py +17 -4
- sglang/srt/layers/linear.py +12 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -2
- sglang/srt/layers/moe/topk.py +8 -2
- sglang/srt/layers/parameter.py +19 -3
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/moe_wna16.py +1 -2
- sglang/srt/layers/quantization/w8a8_int8.py +738 -14
- sglang/srt/managers/io_struct.py +27 -2
- sglang/srt/managers/mm_utils.py +55 -94
- sglang/srt/managers/schedule_batch.py +16 -5
- sglang/srt/managers/scheduler.py +21 -1
- sglang/srt/managers/tokenizer_manager.py +16 -0
- sglang/srt/mem_cache/memory_pool.py +65 -40
- sglang/srt/model_executor/forward_batch_info.py +13 -1
- sglang/srt/model_loader/loader.py +23 -12
- sglang/srt/models/deepseek_janus_pro.py +1 -1
- sglang/srt/models/deepseek_v2.py +62 -17
- sglang/srt/models/deepseek_vl2.py +1 -1
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +6 -3
- sglang/srt/models/internvl.py +8 -2
- sglang/srt/models/kimi_vl.py +8 -2
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llava.py +3 -1
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpmo.py +1 -2
- sglang/srt/models/minicpmv.py +1 -1
- sglang/srt/models/mixtral_quant.py +4 -0
- sglang/srt/models/mllama4.py +13 -4
- sglang/srt/models/phi4mm.py +8 -2
- sglang/srt/models/phimoe.py +553 -0
- sglang/srt/models/qwen2.py +2 -0
- sglang/srt/models/qwen2_5_vl.py +10 -7
- sglang/srt/models/qwen2_vl.py +12 -1
- sglang/srt/models/vila.py +8 -2
- sglang/srt/multimodal/processors/base_processor.py +197 -137
- sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
- sglang/srt/multimodal/processors/gemma3.py +4 -2
- sglang/srt/multimodal/processors/gemma3n.py +1 -1
- sglang/srt/multimodal/processors/internvl.py +1 -1
- sglang/srt/multimodal/processors/janus_pro.py +1 -1
- sglang/srt/multimodal/processors/kimi_vl.py +1 -1
- sglang/srt/multimodal/processors/minicpm.py +4 -3
- sglang/srt/multimodal/processors/mllama4.py +1 -1
- sglang/srt/multimodal/processors/phi4mm.py +1 -1
- sglang/srt/multimodal/processors/pixtral.py +1 -1
- sglang/srt/multimodal/processors/qwen_vl.py +203 -80
- sglang/srt/multimodal/processors/vila.py +1 -1
- sglang/srt/server_args.py +11 -4
- sglang/srt/utils.py +154 -31
- sglang/version.py +1 -1
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/METADATA +4 -3
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/RECORD +75 -70
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/top_level.txt +0 -0
sglang/srt/multimodal/processors/pixtral.py
CHANGED
@@ -106,7 +106,7 @@ class PixtralProcessor(BaseMultimodalProcessor):
                     pixel_values=processor_output["pixel_values"],
                     image_sizes=processor_output["image_sizes"],
                     modality=Modality.IMAGE,
-
+                    offsets=image_offsets,
                 )
             ]
 
sglang/srt/multimodal/processors/qwen_vl.py
CHANGED
@@ -1,9 +1,13 @@
 import asyncio
 import math
+import os
 import re
-from typing import
+from typing import List, Union
 
+import torch
+import torchvision
 from PIL import Image
+from torchvision.transforms import InterpolationMode
 
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
@@ -12,6 +16,185 @@ from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
 from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
+from sglang.utils import logger
+
+IMAGE_FACTOR = 28
+MIN_PIXELS = 4 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
+VIDEO_TOTAL_PIXELS = int(
+    float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
+)
+
+VIDEO_MIN_PIXELS = 128 * 28 * 28
+VIDEO_MAX_PIXELS = 768 * 28 * 28
+FRAME_FACTOR = 2
+FPS = 2.0
+FPS_MIN_FRAMES = 4
+FPS_MAX_FRAMES = 768
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int = IMAGE_FACTOR,
+    min_pixels: int = MIN_PIXELS,
+    max_pixels: int = MAX_PIXELS,
+) -> tuple[int, int]:
+    """
+    Rescales the image so that the following conditions are met:
+
+    1. Both dimensions (height and width) are divisible by 'factor'.
+
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+    3. The aspect ratio of the image is maintained as closely as possible.
+    """
+    if max(height, width) / min(height, width) > MAX_RATIO:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(height / beta, factor)
+        w_bar = floor_by_factor(width / beta, factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(height * beta, factor)
+        w_bar = ceil_by_factor(width * beta, factor)
+    return h_bar, w_bar
+
+
+def resize_image(image, size_factor: int = IMAGE_FACTOR) -> Image.Image:
+    width, height = image.size
+    min_pixels = MIN_PIXELS
+    max_pixels = MAX_PIXELS
+    resized_height, resized_width = smart_resize(
+        height,
+        width,
+        factor=size_factor,
+        min_pixels=min_pixels,
+        max_pixels=max_pixels,
+    )
+    image = image.resize((resized_width, resized_height))
+    return image
+
+
+def round_by_factor(number: int, factor: int) -> int:
+    """Returns the closest integer to 'number' that is divisible by 'factor'."""
+    return round(number / factor) * factor
+
+
+def ceil_by_factor(number: int, factor: int) -> int:
+    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+    return math.ceil(number / factor) * factor
+
+
+def floor_by_factor(number: int, factor: int) -> int:
+    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+    return math.floor(number / factor) * factor
+
+
+async def resize_image_async(image):
+    return resize_image(image)
+
+
+def smart_nframes(
+    ele: dict,
+    total_frames: int,
+    video_fps: int | float,
+) -> int:
+    """calculate the number of frames for video used for model inputs.
+
+    Args:
+        ele (dict): a dict contains the configuration of video.
+            support either `fps` or `nframes`:
+                - nframes: the number of frames to extract for model inputs.
+                - fps: the fps to extract frames for model inputs.
+                    - min_frames: the minimum number of frames of the video, only used when fps is provided.
+                    - max_frames: the maximum number of frames of the video, only used when fps is provided.
+        total_frames (int): the original total number of frames of the video.
+        video_fps (int | float): the original fps of the video.
+
+    Raises:
+        ValueError: nframes should in interval [FRAME_FACTOR, total_frames].
+
+    Returns:
+        int: the number of frames for video used for model inputs.
+    """
+    assert not (
+        "fps" in ele and "nframes" in ele
+    ), "Only accept either `fps` or `nframes`"
+    if "nframes" in ele:
+        nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)
+    else:
+        fps = ele.get("fps", FPS)
+        min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
+        max_frames = floor_by_factor(
+            ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR
+        )
+        nframes = total_frames / video_fps * fps
+        if nframes > total_frames:
+            logger.warning(
+                f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]"
+            )
+        nframes = min(min(max(nframes, min_frames), max_frames), total_frames)
+        nframes = floor_by_factor(nframes, FRAME_FACTOR)
+    if not (FRAME_FACTOR <= nframes and nframes <= total_frames):
+        raise ValueError(
+            f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}."
+        )
+    return nframes
+
+
+# process video, qwen-specific
+async def preprocess_video(
+    vr,
+    image_factor: int = IMAGE_FACTOR,
+    # vr: VideoReader, image_factor: int = IMAGE_FACTOR
+) -> torch.Tensor:
+    ele = {}
+    total_frames, video_fps = len(vr), vr.get_avg_fps()
+    nframes = smart_nframes({}, total_frames=total_frames, video_fps=video_fps)
+    idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
+    video = vr.get_batch(idx).asnumpy()
+    video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
+    nframes, _, height, width = video.shape
+    min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
+    total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
+    max_pixels = max(
+        min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR),
+        int(min_pixels * 1.05),
+    )
+    max_pixels_supposed = ele.get("max_pixels", max_pixels)
+    if max_pixels_supposed > max_pixels:
+        logger.warning(
+            f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}]."
+        )
+    max_pixels = min(max_pixels_supposed, max_pixels)
+    if "resized_height" in ele and "resized_width" in ele:
+        resized_height, resized_width = smart_resize(
+            ele["resized_height"],
+            ele["resized_width"],
+            factor=image_factor,
+        )
+    else:
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=image_factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+    video = torchvision.transforms.functional.resize(
+        video,
+        [resized_height, resized_width],
+        interpolation=InterpolationMode.BICUBIC,
+        antialias=True,
+    ).float()
+    return video
 
 
 # Compatible with Qwen2VL and Qwen2_5VL
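Note on the resizing math above: smart_resize snaps both sides to multiples of `factor` and, when the result exceeds the pixel budget, shrinks both sides by the square root of the overshoot so the aspect ratio is preserved. Here is a minimal standalone sketch of that arithmetic (helpers re-declared locally; the min-pixels branch and the aspect-ratio guard are omitted, and the input sizes are illustrative, not from the diff):

import math

def round_by_factor(number, factor):
    # Closest multiple of `factor`.
    return round(number / factor) * factor

def floor_by_factor(number, factor):
    # Largest multiple of `factor` <= number.
    return math.floor(number / factor) * factor

def smart_resize(height, width, factor=28, max_pixels=16384 * 28 * 28):
    # Snap both sides to the nearest multiple of `factor` (at least one factor).
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    # Over the pixel budget: divide both sides by sqrt of the overshoot,
    # flooring to multiples of `factor` so the result stays under budget.
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    return h_bar, w_bar

print(smart_resize(3024, 4032))  # (3024, 4032): already multiples of 28 and under budget
print(smart_resize(6000, 8000))  # (3080, 4116): scaled down by beta ~= 1.93, then floored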
@@ -37,104 +220,44 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         self.MIN_PIXELS = 4 * 28 * 28
         self.MAX_PIXELS = 16384 * 28 * 28
         self.MAX_RATIO = 200
+        # TODO(mick): move all MultimodalSpecialTokens initializations into processor init
+        self.mm_special_tokens = MultimodalSpecialTokens(
+            image_token=self.IMAGE_TOKEN,
+            image_token_regex=self.IMAGE_TOKEN_REGEX,
+            video_token=self.VIDEO_TOKEN_ID,
+        )
 
     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes
+        image_data: List[Union[str, bytes]],
         input_text,
         request_obj,
         max_req_input_len,
         *args,
         **kwargs,
     ):
+
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-
-
-                image_token_regex=self.IMAGE_TOKEN_REGEX,
-            ),
+            video_data=request_obj.video_data,
+            multimodal_tokens=self.mm_special_tokens,
             max_req_input_len=max_req_input_len,
         )
 
-        def smart_resize(
-            height: int,
-            width: int,
-            factor: int = self.IMAGE_FACTOR,
-            min_pixels: int = self.MIN_PIXELS,
-            max_pixels: int = self.MAX_PIXELS,
-        ) -> tuple[int, int]:
-            """
-            Rescales the image so that the following conditions are met:
-
-            1. Both dimensions (height and width) are divisible by 'factor'.
-
-            2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-
-            3. The aspect ratio of the image is maintained as closely as possible.
-            """
-            if max(height, width) / min(height, width) > self.MAX_RATIO:
-                raise ValueError(
-                    f"absolute aspect ratio must be smaller than {self.MAX_RATIO}, got {max(height, width) / min(height, width)}"
-                )
-            h_bar = max(factor, round_by_factor(height, factor))
-            w_bar = max(factor, round_by_factor(width, factor))
-            if h_bar * w_bar > max_pixels:
-                beta = math.sqrt((height * width) / max_pixels)
-                h_bar = floor_by_factor(height / beta, factor)
-                w_bar = floor_by_factor(width / beta, factor)
-            elif h_bar * w_bar < min_pixels:
-                beta = math.sqrt(min_pixels / (height * width))
-                h_bar = ceil_by_factor(height * beta, factor)
-                w_bar = ceil_by_factor(width * beta, factor)
-            return h_bar, w_bar
-
-        def resize_image(image, size_factor: int = self.IMAGE_FACTOR) -> Image.Image:
-            width, height = image.size
-            min_pixels = self.MIN_PIXELS
-            max_pixels = self.MAX_PIXELS
-            resized_height, resized_width = smart_resize(
-                height,
-                width,
-                factor=size_factor,
-                min_pixels=min_pixels,
-                max_pixels=max_pixels,
-            )
-            image = image.resize((resized_width, resized_height))
-            return image
-
-        def round_by_factor(number: int, factor: int) -> int:
-            """Returns the closest integer to 'number' that is divisible by 'factor'."""
-            return round(number / factor) * factor
-
-        def ceil_by_factor(number: int, factor: int) -> int:
-            """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-            return math.ceil(number / factor) * factor
-
-        def floor_by_factor(number: int, factor: int) -> int:
-            """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-            return math.floor(number / factor) * factor
-
-        async def resize_image_async(image):
-            return resize_image(image)
-
         # Qwen-specific: resize images if they are raw Image objects
         if base_output.images and isinstance(base_output.images[0], Image.Image):
             resize_tasks = [resize_image_async(image) for image in base_output.images]
             base_output.images = await asyncio.gather(*resize_tasks)
 
-
-
-
-
-        if not mm_items:
-            # Note(Xinyuan): This is the case where image loading fails.
-            return None
+        if base_output.videos:
+            base_output.videos = [
+                await preprocess_video(video) for video in base_output.videos
+            ]
 
-
-        video_grid_thw = None  # TODO
-        second_per_grid_ts = getattr(combined_mm_item, "second_per_grid_ts", None)
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(base_output)
 
+        input_ids = input_ids.flatten()
         mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
             spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
             image_token_id=self.IM_TOKEN_ID,
@@ -145,9 +268,9 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
                 self.hf_config.vision_config, "tokens_per_second", None
             ),
             input_ids=input_ids.unsqueeze(0),
-            image_grid_thw=
-            video_grid_thw=video_grid_thw,
-            second_per_grid_ts=second_per_grid_ts,
+            image_grid_thw=getattr(ret, "image_grid_thw", None),
+            video_grid_thw=getattr(ret, "video_grid_thw", None),
+            second_per_grid_ts=getattr(ret, "second_per_grid_ts", None),
         )
         mrope_positions = mrope_positions.squeeze(1)
 
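Likewise, on its default path the frame-count logic in smart_nframes above reduces to: sample at roughly 2 frames per second, clamp to [4, 768] and to the clip length, then round down to a multiple of FRAME_FACTOR. A standalone sketch of just that path, with an illustrative 10-second, 30 fps clip (constants copied from the diff):

import math

FPS = 2.0             # target sampling rate, frames per second
FRAME_FACTOR = 2      # frame counts are rounded to multiples of this
FPS_MIN_FRAMES = 4
FPS_MAX_FRAMES = 768

def floor_by_factor(number, factor):
    return math.floor(number / factor) * factor

def ceil_by_factor(number, factor):
    return math.ceil(number / factor) * factor

def default_nframes(total_frames, video_fps):
    # Default path only: no explicit `fps`/`nframes` in the request dict.
    min_frames = ceil_by_factor(FPS_MIN_FRAMES, FRAME_FACTOR)
    max_frames = floor_by_factor(min(FPS_MAX_FRAMES, total_frames), FRAME_FACTOR)
    nframes = total_frames / video_fps * FPS  # ~2 sampled frames per second
    nframes = min(max(nframes, min_frames), max_frames, total_frames)
    return floor_by_factor(nframes, FRAME_FACTOR)

# A 10 s clip at 30 fps has 300 frames; sampling at 2 fps keeps 20 of them.
print(default_nframes(total_frames=300, video_fps=30))  # 20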
sglang/srt/multimodal/processors/vila.py
CHANGED
@@ -57,7 +57,7 @@ class VILAMultimodalProcessor(BaseMultimodalProcessor):
             image_data=image_data,
         )
 
-        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),
sglang/srt/server_args.py
CHANGED
@@ -416,7 +416,7 @@ class ServerArgs:
         if self.enable_dp_lm_head:
             assert (
                 self.enable_dp_attention
-            ), "Please enable dp attention when setting
+            ), "Please enable dp attention when setting enable_dp_lm_head. "
 
         # DeepEP MoE
         if self.enable_deepep_moe:
@@ -1048,9 +1048,16 @@ class ServerArgs:
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=[
+            choices=[
+                "qwen25",
+                "mistral",
+                "llama3",
+                "deepseekv3",
+                "pythonic",
+                "kimi_k2",
+            ],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and '
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', and 'kimi_k2'.",
         )
 
         # Data parallelism
@@ -1614,7 +1621,7 @@ class ServerArgs:
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
-            choices=["mooncake", "nixl"],
+            choices=["mooncake", "nixl", "ascend"],
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
         parser.add_argument(
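The new values are ordinary CLI choices, so the added kimi_k2 parser (backed by the new sglang/srt/function_call/kimik2_detector.py) can be exercised with the standard launch command; the model path below is illustrative, not taken from this diff:

python -m sglang.launch_server \
    --model-path moonshotai/Kimi-K2-Instruct \
    --tool-call-parser kimi_k2

The new "ascend" choice for --disaggregation-transfer-backend works the same way, selecting the Ascend transfer engine added under sglang/srt/disaggregation/ascend/.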
sglang/srt/utils.py
CHANGED
@@ -197,7 +197,7 @@ def get_int_env_var(name: str, default: int = 0) -> int:
 
 
 def support_triton(backend: str) -> bool:
-    return backend not in ["torch_native", "intel_amx"]
+    return backend not in ["torch_native", "intel_amx", "ascend"]
 
 
 try:
@@ -728,33 +728,6 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarra
     return audio
 
 
-def encode_video(video_path, frame_count_limit=None):
-    # Lazy import because decord is not available on some arm platforms.
-    from decord import VideoReader, cpu
-
-    if not os.path.exists(video_path):
-        logger.error(f"Video {video_path} does not exist")
-        return []
-
-    if frame_count_limit == 0:
-        return []
-
-    def uniform_sample(l, n):
-        gap = len(l) / n
-        idxs = [int(i * gap + gap / 2) for i in range(n)]
-        return [l[i] for i in idxs]
-
-    vr = VideoReader(video_path, ctx=cpu(0))
-    sample_fps = round(vr.get_avg_fps() / 1)  # FPS
-    frame_indices = [i for i in range(0, len(vr), sample_fps)]
-    if frame_count_limit is not None and len(frame_indices) > frame_count_limit:
-        frame_indices = uniform_sample(frame_indices, frame_count_limit)
-
-    frames = vr.get_batch(frame_indices).asnumpy()
-    frames = [Image.fromarray(v.astype("uint8")) for v in frames]
-    return frames
-
-
 def load_image(
     image_file: Union[Image.Image, str, bytes],
 ) -> tuple[Image.Image, tuple[int, int]]:
@@ -774,9 +747,6 @@ def load_image(
     elif image_file.startswith("data:"):
         image_file = image_file.split(",")[1]
         image = Image.open(BytesIO(pybase64.b64decode(image_file, validate=True)))
-    elif image_file.startswith("video:"):
-        image_file = image_file.replace("video:", "")
-        image, image_size = decode_video_base64(image_file)
     elif isinstance(image_file, str):
         image = Image.open(BytesIO(pybase64.b64decode(image_file, validate=True)))
     else:
@@ -785,6 +755,61 @@ def load_image(
     return image, image_size
 
 
+def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
+    # We import decord here to avoid a strange Segmentation fault (core dumped) issue.
+    from decord import VideoReader, cpu, gpu
+
+    try:
+        from decord.bridge import decord_bridge
+
+        ctx = gpu(0)
+        _ = decord_bridge.get_ctx_device(ctx)
+    except Exception:
+        ctx = cpu(0)
+
+    tmp_file = None
+    vr = None
+    try:
+        if isinstance(video_file, bytes):
+            tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+            tmp_file.write(video_file)
+            tmp_file.close()
+            vr = VideoReader(tmp_file.name, ctx=ctx)
+        elif isinstance(video_file, str):
+            if video_file.startswith(("http://", "https://")):
+                timeout = int(os.getenv("REQUEST_TIMEOUT", "10"))
+                response = requests.get(video_file, stream=True, timeout=timeout)
+                response.raise_for_status()
+                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+                for chunk in response.iter_content(chunk_size=8192):
+                    tmp_file.write(chunk)
+                tmp_file.close()
+                vr = VideoReader(tmp_file.name, ctx=ctx)
+            elif video_file.startswith("data:"):
+                _, encoded = video_file.split(",", 1)
+                video_bytes = base64.b64decode(encoded)
+                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+                tmp_file.write(video_bytes)
+                tmp_file.close()
+                vr = VideoReader(tmp_file.name, ctx=ctx)
+            elif os.path.isfile(video_file):
+                vr = VideoReader(video_file, ctx=ctx)
+            else:
+                video_bytes = base64.b64decode(video_file)
+                tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
+                tmp_file.write(video_bytes)
+                tmp_file.close()
+                vr = VideoReader(tmp_file.name, ctx=ctx)
+        else:
+            raise ValueError(f"Unsupported video input type: {type(video_file)}")
+
+        return vr
+
+    finally:
+        if tmp_file and os.path.exists(tmp_file.name):
+            os.unlink(tmp_file.name)
+
+
 def suppress_other_loggers():
     warnings.filterwarnings(
         "ignore", category=UserWarning, message="The given NumPy array is not writable"
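A brief usage sketch for the new load_video helper above: it accepts a local path, an http(s) URL, a data: URI, or raw/base64-encoded bytes, and returns a decord VideoReader. The file path here is a placeholder, and decord must be installed:

from sglang.srt.utils import load_video

vr = load_video("/path/to/clip.mp4")        # placeholder path
print(len(vr), vr.get_avg_fps())            # total frame count and average fps
frames = vr.get_batch([0, 1, 2]).asnumpy()  # first three frames as a (3, H, W, C) uint8 array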
@@ -2757,3 +2782,101 @@ def lru_cache_frozenset(maxsize=128):
         return wrapper
 
     return decorator
+
+
+def apply_module_patch(target_module, target_function, wrappers):
+    original_module, original_function = parse_module_path(
+        target_module, target_function, False
+    )
+
+    original_function_id = id(original_function)
+
+    candidate = original_function
+    for wrapper in wrappers:
+        candidate = wrapper(candidate)
+    if target_function is not None:
+        setattr(original_module, target_function, candidate)
+
+    for key, value in sys.modules.copy().items():
+        if (
+            target_function is not None
+            and hasattr(value, target_function)
+            and id(getattr(value, target_function)) == original_function_id
+        ):
+            setattr(value, target_function, candidate)
+
+
+def parse_module_path(module_path, function_name, create_dummy):
+    from importlib.machinery import ModuleSpec
+
+    def create_dummy_module(full_path, parent=None):
+        """Create and register a placeholder module"""
+        dummy = types.ModuleType(full_path)
+        dummy.__file__ = "vllm_ascend.dummy_module.py"
+        dummy.__spec__ = ModuleSpec(full_path, None)
+        sys.modules[full_path] = dummy
+        if parent:
+            setattr(parent, full_path.split(".")[-1], dummy)
+        return dummy
+
+    def create_placeholder_function(func_name):
+        """Create dummy function that raises when called"""
+
+        def placeholder(*args, **kwargs):
+            raise NotImplementedError(f"Function {func_name} is a placeholder")
+
+        placeholder.__name__ = func_name
+        return placeholder
+
+    modules = module_path.split(".")
+    current_module = None
+    processed_path = []
+
+    for idx, part in enumerate(modules):
+        current_path = ".".join(modules[: idx + 1])
+        parent_path = ".".join(modules[:idx]) if idx > 0 else None
+
+        try:
+            current_module = importlib.import_module(current_path)
+        except ModuleNotFoundError:
+            # Handle missing module
+            parent = importlib.import_module(parent_path) if parent_path else None
+            if parent and hasattr(parent, part):
+                # Use existing attribute from parent
+                current_module = getattr(parent, part)
+                # Check for early function resolution
+                if function_name and hasattr(current_module, function_name):
+                    return current_module, getattr(current_module, function_name)
+                if function_name and create_dummy:
+                    ph_func = create_placeholder_function(function_name)
+                    setattr(current_module, function_name, ph_func)
+                    return current_module, ph_func
+                if function_name:
+                    raise AttributeError(
+                        f"Function {function_name} missing in {current_path}"
+                    )
+            else:
+                if not create_dummy:
+                    raise
+                # Create and register dummy module
+                current_module = create_dummy_module(
+                    current_path,
+                    parent=(
+                        importlib.import_module(parent_path) if parent_path else None
+                    ),
+                )
+
+        processed_path.append(part)
+
+    # Final function handling
+    final_module = sys.modules[module_path]
+    if function_name is not None:
+        if not hasattr(final_module, function_name):
+            if create_dummy:
+                ph_func = create_placeholder_function(function_name)
+                setattr(final_module, function_name, ph_func)
+            else:
+                setattr(final_module, function_name, None)
+        return final_module, getattr(final_module, function_name)
+
+    return final_module, None
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.9.post1"
+__version__ = "0.4.9.post2"
{sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.9.post1
+Version: 0.4.9.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -242,6 +242,7 @@ Requires-Dist: pynvml; extra == "runtime-common"
 Requires-Dist: pybase64; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
+Requires-Dist: sentencepiece; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
@@ -249,10 +250,10 @@ Requires-Dist: transformers==4.53.0; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: xgrammar==0.1.
+Requires-Dist: xgrammar==0.1.21; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.2.
+Requires-Dist: sgl-kernel==0.2.5; extra == "srt"
 Requires-Dist: torch==2.7.1; extra == "srt"
 Requires-Dist: torchaudio==2.7.1; extra == "srt"
 Requires-Dist: torchvision==0.22.1; extra == "srt"