sglang 0.4.2.post4__py3-none-any.whl → 0.4.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. sglang/global_config.py +2 -0
  2. sglang/lang/backend/openai.py +5 -0
  3. sglang/lang/chat_template.py +22 -7
  4. sglang/lang/ir.py +1 -0
  5. sglang/srt/configs/__init__.py +6 -3
  6. sglang/srt/configs/model_config.py +2 -0
  7. sglang/srt/configs/qwen2_5_vl_config.py +1003 -0
  8. sglang/srt/entrypoints/engine.py +18 -3
  9. sglang/srt/hf_transformers_utils.py +2 -3
  10. sglang/srt/layers/attention/flashinfer_backend.py +235 -110
  11. sglang/srt/layers/attention/triton_backend.py +358 -72
  12. sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  13. sglang/srt/layers/linear.py +12 -5
  14. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +2 -2
  15. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  16. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +2 -2
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +200 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +200 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +200 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +178 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +200 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +175 -0
  23. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -2
  24. sglang/srt/layers/moe/fused_moe_triton/layer.py +2 -0
  25. sglang/srt/layers/moe/topk.py +1 -1
  26. sglang/srt/layers/quantization/__init__.py +51 -5
  27. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  28. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +30 -30
  29. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  30. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +29 -29
  31. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  32. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +33 -33
  33. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  34. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +31 -31
  35. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  36. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +27 -27
  37. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  38. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +31 -31
  39. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  40. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +24 -24
  41. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  42. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +30 -30
  43. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  44. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +42 -42
  45. sglang/srt/layers/quantization/fp8_kernel.py +123 -17
  46. sglang/srt/layers/quantization/fp8_utils.py +33 -4
  47. sglang/srt/managers/detokenizer_manager.py +1 -0
  48. sglang/srt/managers/image_processor.py +217 -122
  49. sglang/srt/managers/io_struct.py +4 -0
  50. sglang/srt/managers/schedule_batch.py +16 -3
  51. sglang/srt/managers/scheduler.py +29 -0
  52. sglang/srt/managers/tokenizer_manager.py +6 -0
  53. sglang/srt/managers/tp_worker_overlap_thread.py +4 -0
  54. sglang/srt/model_executor/cuda_graph_runner.py +12 -1
  55. sglang/srt/model_executor/forward_batch_info.py +4 -1
  56. sglang/srt/model_executor/model_runner.py +12 -2
  57. sglang/srt/models/deepseek_nextn.py +295 -0
  58. sglang/srt/models/deepseek_v2.py +21 -8
  59. sglang/srt/models/llava.py +2 -1
  60. sglang/srt/models/qwen2_5_vl.py +722 -0
  61. sglang/srt/models/qwen2_vl.py +2 -1
  62. sglang/srt/openai_api/adapter.py +17 -3
  63. sglang/srt/server_args.py +26 -4
  64. sglang/srt/speculative/eagle_worker.py +35 -10
  65. sglang/srt/speculative/spec_info.py +11 -1
  66. sglang/srt/utils.py +7 -0
  67. sglang/utils.py +99 -19
  68. sglang/version.py +1 -1
  69. {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/METADATA +5 -4
  70. {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/RECORD +73 -55
  71. sglang/srt/configs/qwen2vl.py +0 -130
  72. {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/LICENSE +0 -0
  73. {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/WHEEL +0 -0
  74. {sglang-0.4.2.post4.dist-info → sglang-0.4.3.post1.dist-info}/top_level.txt +0 -0
--- a/sglang/srt/managers/image_processor.py
+++ b/sglang/srt/managers/image_processor.py
@@ -1,6 +1,7 @@
 # TODO: also move pad_input_ids into this module
 import asyncio
 import concurrent.futures
+import dataclasses
 import logging
 import multiprocessing as mp
 import os
@@ -8,6 +9,7 @@ from abc import ABC, abstractmethod
 from typing import List, Optional, Union
 
 import numpy as np
+import PIL
 import transformers
 from decord import VideoReader, cpu
 from PIL import Image
@@ -34,11 +36,22 @@ def init_global_processor(server_args: ServerArgs):
     )
 
 
+@dataclasses.dataclass
+class BaseImageProcessorOutput:
+    image_hashes: list[int]
+    image_sizes: list[int]
+    all_frames: [PIL.Image]
+    # input_text, with each frame of video/image represented with a image_token
+    input_text: str
+
+
 class BaseImageProcessor(ABC):
     def __init__(self, hf_config, server_args, _processor):
         self.hf_config = hf_config
         self._processor = _processor
         self.server_args = server_args
+        # FIXME: not accurate, model and image specific
+        self.NUM_TOKEN_PER_FRAME = 330
 
         self.executor = concurrent.futures.ProcessPoolExecutor(
             initializer=init_global_processor,
@@ -48,9 +61,128 @@ class BaseImageProcessor(ABC):
         )
 
     @abstractmethod
-    async def process_images_async(self, image_data, input_text, **kwargs):
+    async def process_images_async(
+        self, image_data, input_text, max_req_input_len, **kwargs
+    ):
         pass
 
+    def get_estimated_frames_list(self, image_data):
+        """
+        estimate the total frame count from all visual input
+        """
+        # Before processing inputs
+        estimated_frames_list = []
+        for image in image_data:
+            if isinstance(image, str) and image.startswith("video:"):
+                path = image[len("video:") :]
+                # Estimate frames for the video
+                vr = VideoReader(path, ctx=cpu(0))
+                num_frames = len(vr)
+            else:
+                # For images, each contributes one frame
+                num_frames = 1
+            estimated_frames_list.append(num_frames)
+
+        return estimated_frames_list
+
+    def encode_video(self, video_path, frame_count_limit=None):
+        if not os.path.exists(video_path):
+            logger.error(f"Video {video_path} does not exist")
+            return []
+
+        if frame_count_limit == 0:
+            return []
+
+        def uniform_sample(l, n):
+            gap = len(l) / n
+            idxs = [int(i * gap + gap / 2) for i in range(n)]
+            return [l[i] for i in idxs]
+
+        vr = VideoReader(video_path, ctx=cpu(0))
+        sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+        frame_idx = [i for i in range(0, len(vr), sample_fps)]
+        if frame_count_limit is not None and len(frame_idx) > frame_count_limit:
+            frame_idx = uniform_sample(frame_idx, frame_count_limit)
+        frames = vr.get_batch(frame_idx).asnumpy()
+        frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+        return frames
+
+    def load_images(
+        self,
+        max_req_input_len: int,
+        input_ids: list,
+        image_data,
+        image_token: str,
+    ) -> BaseImageProcessorOutput:
+        """
+        Each frame of video/image will be replaced by a single image token
+        """
+        image_hashes, image_sizes = [], []
+        all_frames = []
+        new_text_parts = []
+
+        if isinstance(input_ids, list):
+            assert len(input_ids) and isinstance(input_ids[0], int)
+            input_text = self._processor.tokenizer.decode(input_ids)
+        else:
+            input_text = input_ids
+
+        text_parts = input_text.split(image_token)
+
+        # roughly calculate the max number of frames under the max_req_input_len limit
+        def calculate_max_num_frames() -> int:
+            ret = (max_req_input_len - len(input_ids)) // self.NUM_TOKEN_PER_FRAME
+            return min(ret, 100)
+
+        MAX_NUM_FRAMES = calculate_max_num_frames()
+        estimated_frames_list = self.get_estimated_frames_list(image_data=image_data)
+        total_frame_count = sum(estimated_frames_list)
+        # a heuristic value, suggesting the maximum fraction of frames to embed from all visual inputs.
+        # e.g., 0.1 suggests that 1 frame out of 10 input frames should be used
+        scaling_factor = min(1.0, MAX_NUM_FRAMES / total_frame_count)
+
+        # Process each input with allocated frames
+        for image_index, (image, estimated_frames) in enumerate(
+            zip(image_data, estimated_frames_list)
+        ):
+            if len(all_frames) >= MAX_NUM_FRAMES:
+                frames_to_process = 0
+            else:
+                frames_to_process = max(1, int(estimated_frames * scaling_factor))
+
+            if frames_to_process == 0:
+                frames = []
+            else:
+                try:
+                    if isinstance(image, str) and image.startswith("video:"):
+                        path = image[len("video:") :]
+                        frames = self.encode_video(
+                            path, frame_count_limit=frames_to_process
+                        )
+                    else:
+                        raw_image, _size = load_image(image)
+                        frames = [raw_image]
+                    if len(frames) == 0:
+                        continue
+                except FileNotFoundError as e:
+                    print(e)
+                    return None
+                image_sizes += frames[0].size * len(frames)
+                image_hashes += [hash(image)] * len(frames)
+                all_frames += frames
+
+            new_text_parts.append(text_parts[image_index])
+            if frames_to_process != 0:
+                new_text_parts.append(image_token * len(frames))
+                assert frames_to_process == len(frames)
+
+        new_text_parts.append(text_parts[-1])
+
+        input_text = "".join(new_text_parts)
+        return BaseImageProcessorOutput(
+            image_hashes, image_sizes, all_frames, input_text
+        )
+
 
 class DummyImageProcessor(BaseImageProcessor):
     def __init__(self):
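
The new shared load_images path budgets frames across all visual inputs: a per-frame token cost (NUM_TOKEN_PER_FRAME, 330 for MiniCPMV above, 770 for Qwen2.5-VL below) caps the total frame count, and a scaling factor spreads that budget over the inputs. A standalone sketch of the arithmetic, with made-up numbers:

```python
# Standalone sketch of the frame-budget math in load_images (numbers are hypothetical).
max_req_input_len = 16384          # request token limit
prompt_len = 2048                  # len(input_ids)
NUM_TOKEN_PER_FRAME = 330          # MiniCPMV value from the diff above

# MAX_NUM_FRAMES: how many frames fit in the remaining budget, capped at 100
max_num_frames = min((max_req_input_len - prompt_len) // NUM_TOKEN_PER_FRAME, 100)

estimated_frames_list = [90, 1]    # e.g. one 90-frame video and one still image
total_frame_count = sum(estimated_frames_list)

# fraction of each input's frames that can be embedded (capped at 1.0)
scaling_factor = min(1.0, max_num_frames / total_frame_count)

frames_to_process = [max(1, int(est * scaling_factor)) for est in estimated_frames_list]
print(max_num_frames, round(scaling_factor, 2), frames_to_process)
# -> 43 0.47 [42, 1]
```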
@@ -248,9 +380,9 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
             text=input_text, images=images, return_tensors="pt"
         )
         return {
-            "input_ids": result["input_ids"],
-            "pixel_values": result["pixel_values"],
-            "tgt_sizes": result["tgt_sizes"],
+            "input_ids": result.input_ids,
+            "pixel_values": result.pixel_values,
+            "tgt_sizes": result.tgt_sizes,
         }
 
     async def _process_images(self, images, input_text):
@@ -278,124 +410,20 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
     ):
         if not image_data:
             return None
-
         if not isinstance(image_data, list):
             image_data = [image_data]
 
-        image_hashes, image_sizes = [], []
-        all_frames = []
-
-        # roughly calculate the max number of frames under the max_req_input_len limit
-        def calculate_max_num_frames() -> int:
-            # Model-specific
-            NUM_TOKEN_PER_FRAME = 330
-
-            ret = (max_req_input_len - len(input_ids)) // NUM_TOKEN_PER_FRAME
-            return min(ret, 100)
-
-        MAX_NUM_FRAMES = calculate_max_num_frames()
-
-        # print(f"MAX_NUM_FRAMES: {MAX_NUM_FRAMES}")
-
-        def get_estimated_frames_list():
-            """
-            estimate the total frame count from all visual input
-            """
-            # Before processing inputs
-            estimated_frames_list = []
-            for image in image_data:
-                if isinstance(image, str) and image.startswith("video:"):
-                    path = image[len("video:") :]
-                    # Estimate frames for the video
-                    vr = VideoReader(path, ctx=cpu(0))
-                    num_frames = len(vr)
-                else:
-                    # For images, each contributes one frame
-                    num_frames = 1
-                estimated_frames_list.append(num_frames)
-
-            return estimated_frames_list
-
-        estimated_frames_list = get_estimated_frames_list()
-        total_frame_count = sum(estimated_frames_list)
-        scaling_factor = min(1.0, MAX_NUM_FRAMES / total_frame_count)
-
-        def encode_video(video_path, frame_count_limit=None):
-            if not os.path.exists(video_path):
-                logger.error(f"Video {video_path} does not exist")
-                return []
-
-            if frame_count_limit == 0:
-                return []
-
-            def uniform_sample(l, n):
-                gap = len(l) / n
-                idxs = [int(i * gap + gap / 2) for i in range(n)]
-                return [l[i] for i in idxs]
-
-            vr = VideoReader(video_path, ctx=cpu(0))
-            sample_fps = round(vr.get_avg_fps() / 1)  # FPS
-            frame_idx = [i for i in range(0, len(vr), sample_fps)]
-            if frame_count_limit is not None and len(frame_idx) > frame_count_limit:
-                frame_idx = uniform_sample(frame_idx, frame_count_limit)
-            frames = vr.get_batch(frame_idx).asnumpy()
-            frames = [Image.fromarray(v.astype("uint8")) for v in frames]
-            return frames
-
-        if isinstance(input_ids, list):
-            assert len(input_ids) and isinstance(input_ids[0], int)
-            input_text = self._processor.tokenizer.decode(input_ids)
-        else:
-            input_text = input_ids
-        # MiniCPMV requires each frame of video as a single image token
-        text_parts = input_text.split(self.IMAGE_TOKEN)
-        new_text_parts = []
-
-        # Process each input with allocated frames
-        for image_index, (image, estimated_frames) in enumerate(
-            zip(image_data, estimated_frames_list)
-        ):
-            if len(all_frames) >= MAX_NUM_FRAMES:
-                frames_to_process = 0
-            else:
-                frames_to_process = max(1, int(estimated_frames * scaling_factor))
-
-            if frames_to_process == 0:
-                frames = []
-            else:
-                try:
-                    if isinstance(image, str) and image.startswith("video:"):
-                        path = image[len("video:") :]
-                        frames = encode_video(path, frame_count_limit=frames_to_process)
-                    else:
-                        raw_image, _size = load_image(image)
-                        frames = [raw_image]
-                    if len(frames) == 0:
-                        continue
-                except FileNotFoundError as e:
-                    print(e)
-                    return None
-                image_sizes += frames[0].size * len(frames)
-                image_hashes += [hash(image)] * len(frames)
-                all_frames += frames
-
-                assert frames_to_process == len(frames)
-
-            new_text_parts.append(text_parts[image_index])
-
-            if frames_to_process != 0:
-                new_text_parts.append(self.IMAGE_TOKEN * len(frames))
-
-        new_text_parts.append(text_parts[-1])
-
-        input_text = "".join(new_text_parts)
+        base_output = self.load_images(
+            max_req_input_len, input_ids, image_data, self.IMAGE_TOKEN
+        )
+        if base_output is None:
+            return None
 
-        if len(all_frames) == 0:
+        if len(base_output.all_frames) == 0:
             return None
-        res = await self._process_images(images=all_frames, input_text=input_text)
-        pixel_values = res["pixel_values"]
-        tgt_sizes = res["tgt_sizes"]
-        input_ids = res["input_ids"]
+        res = await self._process_images(
+            images=base_output.all_frames, input_text=base_output.input_text
+        )
 
         # Collect special token ids
         tokenizer = self._processor.tokenizer
@@ -405,10 +433,10 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
         slice_start_id = [tokenizer.slice_start_id]
         slice_end_id = [tokenizer.slice_end_id]
         return {
-            "input_ids": input_ids.flatten().tolist(),
-            "pixel_values": pixel_values,
-            "tgt_sizes": tgt_sizes,
-            "image_hashes": image_hashes,
+            "input_ids": res["input_ids"].flatten().tolist(),
+            "pixel_values": res["pixel_values"],
+            "tgt_sizes": res["tgt_sizes"],
+            "image_hashes": base_output.image_hashes,
             "modalities": request_obj.modalities or ["image"],
             "im_start_id": im_start_id,
             "im_end_id": im_end_id,
@@ -536,13 +564,80 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
         }
 
 
+class Qwen2_5VLImageProcessor(BaseImageProcessor):
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+        self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
+        self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
+        self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
+        self.NUM_TOKEN_PER_FRAME = 770
+
+    @staticmethod
+    def _process_images_task(images, input_text):
+        result = global_processor.__call__(
+            text=input_text, images=images, return_tensors="pt"
+        )
+        return {
+            "input_ids": result.input_ids,
+            "pixel_values": result.pixel_values,
+            "image_grid_thws": result.image_grid_thw,
+        }
+
+    async def _process_images(self, images, input_text) -> dict:
+        if self.executor is not None:
+            loop = asyncio.get_event_loop()
+            return await loop.run_in_executor(
+                self.executor,
+                Qwen2_5VLImageProcessor._process_images_task,
+                images,
+                input_text,
+            )
+        else:
+            return self._process_images_task(images, input_text)
+
+    async def process_images_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_ids,
+        request_obj,
+        max_req_input_len,
+        *args,
+        **kwargs,
+    ):
+        if not image_data:
+            return None
+        if isinstance(image_data, str):
+            image_data = [image_data]
+
+        image_token = self.IMAGE_TOKEN
+        base_output = self.load_images(
+            max_req_input_len, input_ids, image_data, image_token
+        )
+
+        ret = await self._process_images(base_output.all_frames, base_output.input_text)
+
+        return {
+            "input_ids": ret["input_ids"].flatten().tolist(),
+            "pixel_values": ret["pixel_values"],
+            "image_hashes": base_output.image_hashes,
+            "modalities": request_obj.modalities or ["image"],
+            "image_grid_thws": ret["image_grid_thws"],
+            "im_start_id": self.IM_START_TOKEN_ID,
+            "im_end_id": self.IM_END_TOKEN_ID,
+        }
+
+
 def get_image_processor(
     hf_config, server_args: ServerArgs, processor
 ) -> BaseImageProcessor:
     if "MllamaForConditionalGeneration" in hf_config.architectures:
         return MllamaImageProcessor(hf_config, server_args, processor)
     elif "Qwen2VLForConditionalGeneration" in hf_config.architectures:
-        return Qwen2VLImageProcessor(hf_config, server_args, processor.image_processor)
+
+        return Qwen2VLImageProcessor(hf_config, server_args, processor)
+    elif "Qwen2_5_VLForConditionalGeneration" in hf_config.architectures:
+        return Qwen2_5VLImageProcessor(hf_config, server_args, processor)
+
     elif "MiniCPMV" in hf_config.architectures:
         return MiniCPMVImageProcessor(hf_config, server_args, processor)
     else:
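
For the new Qwen2_5VLImageProcessor, the shared load_images helper splits the prompt on the processor's IMAGE_TOKEN and re-expands each visual input into one token per kept frame. A standalone sketch of that prompt rewriting (frame counts are illustrative):

```python
# Standalone sketch of the prompt rewriting performed by load_images (illustrative values).
IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
prompt = f"Describe this clip {IMAGE_TOKEN} and this photo {IMAGE_TOKEN} please."
frames_per_input = [4, 1]          # e.g. 4 sampled video frames, 1 still image

parts = prompt.split(IMAGE_TOKEN)  # text segments between image tokens
rebuilt = []
for text, n_frames in zip(parts, frames_per_input):
    rebuilt.append(text)
    rebuilt.append(IMAGE_TOKEN * n_frames)   # one token per kept frame
rebuilt.append(parts[-1])
print("".join(rebuilt))
```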
--- a/sglang/srt/managers/io_struct.py
+++ b/sglang/srt/managers/io_struct.py
@@ -371,6 +371,8 @@ class BatchTokenIDOut:
     output_top_logprobs_val: List[List]
     output_top_logprobs_idx: List[List]
 
+    output_hidden_states: List[List[float]]
+
 
 @dataclass
 class BatchStrOut:
@@ -397,6 +399,8 @@ class BatchStrOut:
     output_top_logprobs_val: List[List]
     output_top_logprobs_idx: List[List]
 
+    output_hidden_states: List[List[float]]
+
 
 @dataclass
 class BatchEmbeddingOut:
--- a/sglang/srt/managers/schedule_batch.py
+++ b/sglang/srt/managers/schedule_batch.py
@@ -65,6 +65,7 @@ global_server_args_dict = {
     "enable_dp_attention": ServerArgs.enable_dp_attention,
     "enable_ep_moe": ServerArgs.enable_ep_moe,
     "device": ServerArgs.device,
+    "enable_flashinfer_mla": ServerArgs.enable_flashinfer_mla,
 }
 
 logger = logging.getLogger(__name__)
@@ -315,6 +316,7 @@ class Req:
         self.output_token_logprobs_val = self.output_token_logprobs_idx = (
             self.output_top_logprobs_val
         ) = self.output_top_logprobs_idx = None
+        self.hidden_states = []
 
         # Logprobs (internal values)
         # The tokens is prefilled but need to be considered as decode tokens
@@ -604,6 +606,9 @@ class ScheduleBatch:
     # Enable custom logit processor
     enable_custom_logit_processor: bool = False
 
+    # Return hidden states
+    return_hidden_states: bool = False
+
     @classmethod
     def init_new(
         cls,
@@ -615,6 +620,7 @@ class ScheduleBatch:
         enable_overlap: bool,
         spec_algorithm: SpeculativeAlgorithm,
         enable_custom_logit_processor: bool,
+        return_hidden_states: bool = False,
     ):
         return cls(
             reqs=reqs,
@@ -629,6 +635,7 @@ class ScheduleBatch:
             device=req_to_token_pool.device,
             spec_algorithm=spec_algorithm,
             enable_custom_logit_processor=enable_custom_logit_processor,
+            return_hidden_states=return_hidden_states,
         )
 
     def batch_size(self):
@@ -1196,9 +1203,15 @@ class ScheduleBatch:
             spec_algorithm=self.spec_algorithm,
             spec_info=self.spec_info,
             capture_hidden_mode=(
-                getattr(self.spec_info, "capture_hidden_mode", CaptureHiddenMode.NULL)
-                if self.spec_info
-                else CaptureHiddenMode.NULL
+                CaptureHiddenMode.FULL
+                if self.return_hidden_states
+                else (
+                    getattr(
+                        self.spec_info, "capture_hidden_mode", CaptureHiddenMode.NULL
+                    )
+                    if self.spec_info
+                    else CaptureHiddenMode.NULL
+                )
             ),
         )
 
--- a/sglang/srt/managers/scheduler.py
+++ b/sglang/srt/managers/scheduler.py
@@ -997,6 +997,7 @@ class Scheduler:
             self.enable_overlap,
             self.spec_algorithm,
             self.server_args.enable_custom_logit_processor,
+            self.server_args.return_hidden_states,
         )
         new_batch.prepare_for_extend()
 
@@ -1156,6 +1157,8 @@ class Scheduler:
                 logits_output.input_token_logprobs.tolist()
             )
 
+        hidden_state_offset = 0
+
         # Check finish conditions
         logprob_pt = 0
         for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
@@ -1182,6 +1185,21 @@ class Scheduler:
                     i, req, logprob_pt, next_token_ids, logits_output
                 )
 
+            if (
+                self.server_args.return_hidden_states
+                and logits_output.hidden_states is not None
+            ):
+                req.hidden_states.append(
+                    logits_output.hidden_states[
+                        hidden_state_offset : (
+                            hidden_state_offset := hidden_state_offset
+                            + len(req.origin_input_ids)
+                        )
+                    ]
+                    .cpu()
+                    .clone()
+                )
+
             if req.grammar is not None:
                 req.grammar.accept_token(next_token_id)
                 req.grammar.finished = req.finished()
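
During prefill, all requests' prompt tokens are packed into one hidden_states tensor, and the walrus-operator slice above advances hidden_state_offset so each request takes exactly its own rows. A standalone sketch of the same pattern (shapes are hypothetical):

```python
# Standalone sketch of the offset/walrus slicing used above (hypothetical shapes).
import torch

prompt_lens = [3, 5, 2]            # len(req.origin_input_ids) for each request
hidden_size = 4
packed = torch.randn(sum(prompt_lens), hidden_size)  # all prefill tokens, concatenated

offset = 0
per_req = []
for n in prompt_lens:
    # same pattern as hidden_state_offset : (hidden_state_offset := hidden_state_offset + n)
    per_req.append(packed[offset:(offset := offset + n)].cpu().clone())

assert [t.shape[0] for t in per_req] == prompt_lens
```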
@@ -1275,6 +1293,12 @@ class Scheduler:
                     logits_output.next_token_top_logprobs_idx[i]
                 )
 
+            if (
+                self.server_args.return_hidden_states
+                and logits_output.hidden_states is not None
+            ):
+                req.hidden_states.append(logits_output.hidden_states[i].cpu().clone())
+
             if req.grammar is not None:
                 req.grammar.accept_token(next_token_id)
                 req.grammar.finished = req.finished()
@@ -1398,6 +1422,7 @@ class Scheduler:
            completion_tokens = []
            cached_tokens = []
            spec_verify_ct = []
+            hidden_states = []
 
            if return_logprob:
                input_token_logprobs_val = []
@@ -1464,6 +1489,8 @@ class Scheduler:
                        output_top_logprobs_val.append(req.output_top_logprobs_val)
                        output_top_logprobs_idx.append(req.output_top_logprobs_idx)
 
+                hidden_states.append(req.hidden_states)
+
            # Send to detokenizer
            if rids:
                self.send_to_detokenizer.send_pyobj(
@@ -1490,6 +1517,7 @@ class Scheduler:
                        input_top_logprobs_idx,
                        output_top_logprobs_val,
                        output_top_logprobs_idx,
+                        hidden_states,
                    )
                )
        else:  # embedding or reward model
@@ -1553,6 +1581,7 @@ class Scheduler:
            self.enable_overlap,
            self.spec_algorithm,
            self.server_args.enable_custom_logit_processor,
+            self.server_args.return_hidden_states,
        )
        idle_batch.prepare_for_idle()
        return idle_batch
--- a/sglang/srt/managers/tokenizer_manager.py
+++ b/sglang/srt/managers/tokenizer_manager.py
@@ -796,6 +796,12 @@ class TokenizerManager:
                    }
                )
 
+            if (
+                hasattr(recv_obj, "output_hidden_states")
+                and len(recv_obj.output_hidden_states[i]) > 0
+            ):
+                meta_info["hidden_states"] = recv_obj.output_hidden_states[i]
+
            if isinstance(recv_obj, BatchStrOut):
                out_dict = {
                    "text": recv_obj.output_strs[i],
--- a/sglang/srt/managers/tp_worker_overlap_thread.py
+++ b/sglang/srt/managers/tp_worker_overlap_thread.py
@@ -156,6 +156,10 @@ class TpModelWorkerClient:
                logits_output.input_token_logprobs = (
                    logits_output.input_token_logprobs.to("cpu", non_blocking=True)
                )
+            if logits_output.hidden_states is not None:
+                logits_output.hidden_states = logits_output.hidden_states.to(
+                    "cpu", non_blocking=True
+                )
            next_token_ids = next_token_ids.to("cpu", non_blocking=True)
            copy_done.record()
 
--- a/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/sglang/srt/model_executor/cuda_graph_runner.py
@@ -33,6 +33,9 @@ from sglang.srt.model_executor.forward_batch_info import (
     ForwardBatch,
     ForwardMode,
 )
+from sglang.srt.utils import is_hip
+
+is_hip_ = is_hip()
 
 if TYPE_CHECKING:
     from sglang.srt.model_executor.model_runner import ModelRunner
@@ -129,6 +132,8 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
        if bs <= model_runner.req_to_token_pool.size
        and bs <= server_args.cuda_graph_max_bs
    ]
+    if is_hip_:
+        capture_bs += [i * 8 for i in range(21, 33)]
    compile_bs = (
        [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
        if server_args.enable_torch_compile
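
On ROCm (is_hip_), the capture list gains the larger multiples of 8; spelled out, the extra graph batch sizes are:

```python
# The extra CUDA-graph batch sizes added on ROCm by the change above.
extra_rocm_bs = [i * 8 for i in range(21, 33)]
print(extra_rocm_bs)   # [168, 176, 184, ..., 248, 256]
```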
@@ -349,7 +354,13 @@ class CudaGraphRunner:
            spec_algorithm=self.model_runner.spec_algorithm,
            spec_info=spec_info,
            capture_hidden_mode=(
-                spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
+                CaptureHiddenMode.FULL
+                if self.model_runner.server_args.return_hidden_states
+                else (
+                    spec_info.capture_hidden_mode
+                    if spec_info
+                    else CaptureHiddenMode.NULL
+                )
            ),
        )
 
--- a/sglang/srt/model_executor/forward_batch_info.py
+++ b/sglang/srt/model_executor/forward_batch_info.py
@@ -263,7 +263,10 @@ class ForwardBatch:
            ret.extend_prefix_lens = torch.tensor(
                batch.extend_prefix_lens, dtype=torch.int32
            ).to(device, non_blocking=True)
-            if model_runner.server_args.attention_backend != "torch_native":
+            if (
+                model_runner.server_args.attention_backend != "torch_native"
+                and model_runner.server_args.speculative_algorithm != "NEXTN"
+            ):
                ret.extend_num_tokens = batch.extend_num_tokens
                positions, ret.extend_start_loc = compute_position_triton(
                    ret.extend_prefix_lens, ret.extend_seq_lens, ret.extend_num_tokens
--- a/sglang/srt/model_executor/model_runner.py
+++ b/sglang/srt/model_executor/model_runner.py
@@ -67,6 +67,7 @@ from sglang.srt.utils import (
     monkey_patch_p2p_access_check,
     monkey_patch_vllm_gguf_config,
     set_cpu_offload_max_bytes,
+    set_cuda_arch,
 )
 
 logger = logging.getLogger(__name__)
@@ -110,8 +111,14 @@ class ModelRunner:
        ):
            # TODO: add MLA optimization on CPU
            if self.server_args.device != "cpu":
-                logger.info("MLA optimization is turned on. Use triton backend.")
-                self.server_args.attention_backend = "triton"
+                if server_args.enable_flashinfer_mla:
+                    logger.info(
+                        "FlashInfer MLA optimization is turned on. Use flashinfer backend for DeepseekV3ForCausalLM."
+                    )
+                    self.server_args.attention_backend = "flashinfer"
+                else:
+                    logger.info("MLA optimization is turned on. Use triton backend.")
+                    self.server_args.attention_backend = "triton"
 
        if self.server_args.enable_double_sparsity:
            logger.info(
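
With enable_flashinfer_mla set, DeepSeek-style MLA models are routed to the flashinfer attention backend instead of triton. A hedged sketch of turning it on through the offline Engine API, assuming the new ServerArgs field is accepted through Engine kwargs (model id and tp_size are illustrative; server_args.py presumably also exposes it as a --enable-flashinfer-mla launch flag):

```python
# Hedged sketch, not taken from the diff: enabling the FlashInfer MLA path.
import sglang as sgl

engine = sgl.Engine(
    model_path="deepseek-ai/DeepSeek-V3",   # illustrative MLA model
    tp_size=8,                              # illustrative parallelism
    trust_remote_code=True,
    enable_flashinfer_mla=True,             # new flag added in this release
)
```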
@@ -169,6 +176,7 @@ class ModelRunner:
                "enable_dp_attention": server_args.enable_dp_attention,
                "enable_ep_moe": server_args.enable_ep_moe,
                "device": server_args.device,
+                "enable_flashinfer_mla": server_args.enable_flashinfer_mla,
            }
        )
 
@@ -292,6 +300,8 @@ class ModelRunner:
            if torch.cuda.get_device_capability()[1] < 5:
                raise RuntimeError("SGLang only supports sm75 and above.")
 
+        set_cuda_arch()
+
        # Prepare the model config
        self.load_config = LoadConfig(
            load_format=self.server_args.load_format,