sglang 0.4.4.post3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +49 -7
- sglang/lang/chat_template.py +24 -0
- sglang/srt/_custom_ops.py +59 -92
- sglang/srt/configs/model_config.py +5 -0
- sglang/srt/constrained/base_grammar_backend.py +5 -1
- sglang/srt/conversation.py +29 -4
- sglang/srt/custom_op.py +5 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +27 -79
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/entrypoints/engine.py +0 -5
- sglang/srt/layers/attention/flashattention_backend.py +678 -83
- sglang/srt/layers/attention/flashinfer_backend.py +5 -7
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
- sglang/srt/layers/moe/ep_moe/layer.py +79 -80
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
- sglang/srt/layers/moe/fused_moe_native.py +5 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +416 -50
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- sglang/srt/layers/moe/topk.py +49 -3
- sglang/srt/layers/quantization/__init__.py +5 -1
- sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
- sglang/srt/layers/quantization/fp8.py +3 -1
- sglang/srt/layers/quantization/fp8_utils.py +1 -4
- sglang/srt/layers/quantization/moe_wna16.py +503 -0
- sglang/srt/layers/quantization/utils.py +1 -1
- sglang/srt/layers/quantization/w8a8_int8.py +2 -0
- sglang/srt/layers/radix_attention.py +2 -0
- sglang/srt/layers/rotary_embedding.py +63 -12
- sglang/srt/managers/cache_controller.py +34 -11
- sglang/srt/managers/mm_utils.py +202 -156
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
- sglang/srt/managers/multimodal_processors/clip.py +7 -26
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
- sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
- sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
- sglang/srt/managers/multimodal_processors/llava.py +34 -14
- sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
- sglang/srt/managers/multimodal_processors/mlama.py +10 -23
- sglang/srt/managers/multimodal_processors/mllama4.py +161 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
- sglang/srt/managers/schedule_batch.py +185 -128
- sglang/srt/managers/scheduler.py +4 -4
- sglang/srt/managers/tokenizer_manager.py +1 -1
- sglang/srt/managers/utils.py +1 -6
- sglang/srt/mem_cache/hiradix_cache.py +62 -52
- sglang/srt/mem_cache/memory_pool.py +72 -6
- sglang/srt/mem_cache/paged_allocator.py +39 -0
- sglang/srt/metrics/collector.py +23 -53
- sglang/srt/model_executor/cuda_graph_runner.py +8 -6
- sglang/srt/model_executor/forward_batch_info.py +10 -10
- sglang/srt/model_executor/model_runner.py +60 -57
- sglang/srt/model_loader/loader.py +8 -0
- sglang/srt/models/clip.py +12 -7
- sglang/srt/models/deepseek_janus_pro.py +10 -15
- sglang/srt/models/deepseek_v2.py +212 -121
- sglang/srt/models/deepseek_vl2.py +105 -104
- sglang/srt/models/gemma3_mm.py +14 -80
- sglang/srt/models/llama.py +16 -5
- sglang/srt/models/llama4.py +420 -0
- sglang/srt/models/llava.py +31 -19
- sglang/srt/models/llavavid.py +16 -7
- sglang/srt/models/minicpmo.py +63 -147
- sglang/srt/models/minicpmv.py +17 -27
- sglang/srt/models/mllama.py +29 -14
- sglang/srt/models/mllama4.py +154 -0
- sglang/srt/models/qwen2.py +9 -6
- sglang/srt/models/qwen2_5_vl.py +21 -31
- sglang/srt/models/qwen2_vl.py +20 -21
- sglang/srt/openai_api/adapter.py +18 -6
- sglang/srt/platforms/interface.py +371 -0
- sglang/srt/server_args.py +99 -14
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
- sglang/srt/speculative/eagle_utils.py +140 -28
- sglang/srt/speculative/eagle_worker.py +93 -24
- sglang/srt/utils.py +104 -51
- sglang/test/test_custom_ops.py +55 -0
- sglang/test/test_utils.py +13 -26
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/METADATA +4 -3
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/RECORD +99 -84
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/top_level.txt +0 -0
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -11,7 +11,11 @@ from sglang.srt.distributed import GroupCoordinator, patch_tensor_parallel_group
 from sglang.srt.layers.dp_attention import disable_dp_size
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.layers.sampler import get_token_ids_logprobs, get_top_logprobs
-from sglang.srt.managers.schedule_batch import
+from sglang.srt.managers.schedule_batch import (
+    ScheduleBatch,
+    get_last_loc,
+    global_server_args_dict,
+)
 from sglang.srt.managers.tp_worker import TpModelWorker
 from sglang.srt.model_executor.forward_batch_info import (
     CaptureHiddenMode,
@@ -67,6 +71,7 @@ class EAGLEWorker(TpModelWorker):
         self.gpu_id = gpu_id
         self.device = server_args.device
         self.target_worker = target_worker
+        self.page_size = server_args.page_size
         self.speculative_algorithm = SpeculativeAlgorithm.from_string(
             server_args.speculative_algorithm
         )
@@ -145,15 +150,26 @@ class EAGLEWorker(TpModelWorker):
     def init_attention_backend(self):
         # Create multi-step attn backends and cuda graph runners
         if self.server_args.attention_backend == "flashinfer":
-
-
-
+            if not global_server_args_dict["use_mla_backend"]:
+                from sglang.srt.layers.attention.flashinfer_backend import (
+                    FlashInferMultiStepDraftBackend,
+                )
 
-
-
-
-
-
+                self.draft_attn_backend = FlashInferMultiStepDraftBackend(
+                    self.draft_model_runner,
+                    self.topk,
+                    self.speculative_num_steps,
+                )
+            else:
+                from sglang.srt.layers.attention.flashinfer_mla_backend import (
+                    FlashInferMLAMultiStepDraftBackend,
+                )
+
+                self.draft_attn_backend = FlashInferMLAMultiStepDraftBackend(
+                    self.draft_model_runner,
+                    self.topk,
+                    self.speculative_num_steps,
+                )
             self.draft_extend_attn_backend = None
             self.padded_static_len = self.speculative_num_steps + 1
             self.has_prefill_wrapper_verify = True
@@ -170,19 +186,19 @@ class EAGLEWorker(TpModelWorker):
             self.draft_extend_attn_backend = None
             self.padded_static_len = self.speculative_num_steps + 1
             self.has_prefill_wrapper_verify = False
-        elif self.server_args.attention_backend == "
-            from sglang.srt.layers.attention.
-
+        elif self.server_args.attention_backend == "fa3":
+            from sglang.srt.layers.attention.flashattention_backend import (
+                FlashAttentionMultiStepBackend,
             )
 
-            self.draft_attn_backend =
+            self.draft_attn_backend = FlashAttentionMultiStepBackend(
                 self.draft_model_runner,
                 self.topk,
                 self.speculative_num_steps,
             )
             self.draft_extend_attn_backend = None
             self.padded_static_len = self.speculative_num_steps + 1
-            self.has_prefill_wrapper_verify =
+            self.has_prefill_wrapper_verify = False
         else:
            raise ValueError(
                 f"EAGLE is not supportted in attention backend {self.server_args.attention_backend}"
@@ -234,14 +250,11 @@ class EAGLEWorker(TpModelWorker):
         """
         if batch.forward_mode.is_decode():
             with self.draft_tp_context(self.draft_model_runner.tp_group):
-                spec_info
+                spec_info = self.draft(batch)
             logits_output, verify_output, model_worker_batch = self.verify(
                 batch, spec_info
             )
 
-            # Free cache loc (we put it here to avoid synchronization and hide kernel launch overhead.)
-            self.token_to_kv_pool_allocator.free(to_free_cache_loc)
-
             # If it is None, it means all requests are finished
             if batch.spec_info.verified_id is not None:
                 with self.draft_tp_context(self.draft_model_runner.tp_group):
@@ -305,9 +318,59 @@ class EAGLEWorker(TpModelWorker):
         )
 
         # Allocate cache locations
-
-
-
+        if self.page_size == 1:
+            out_cache_loc, token_to_kv_pool_state_backup = batch.alloc_token_slots(
+                num_seqs * self.topk * self.speculative_num_steps, backup_state=True
+            )
+        else:
+            if self.topk == 1:
+                prefix_lens = batch.seq_lens
+                seq_lens = prefix_lens + self.speculative_num_steps
+                extend_num_tokens = num_seqs * self.speculative_num_steps
+            else:
+                # In this case, the last partial page needs to be duplicated.
+                # KV cache layout in batch.req_to_token_pool.req_to_token:
+                #
+                # | -------- | -- xxxx .. | -- xxxx .. | -- xxxx .. |
+                #   prefix     top-k = 0    tok-k = 1    top-k = 2
+                #
+                # "-" means prefix tokens
+                # "x" means speculative draft tokens
+                # "." means padded tokens
+
+                # TODO: fuse these ops
+                prefix_lens = batch.seq_lens
+                last_page_lens = prefix_lens % self.page_size
+                num_new_pages = (
+                    last_page_lens + self.speculative_num_steps + self.page_size - 1
+                ) // self.page_size
+                seq_lens = (
+                    prefix_lens // self.page_size * self.page_size
+                    + num_new_pages * (self.page_size * self.topk)
+                )
+                extend_num_tokens = torch.sum(seq_lens - prefix_lens).item()
+                raise NotImplementedError(
+                    "page_size > 1 and top_k > 1 are not supported."
+                )
+                # TODO: Support page_size > 1 and top_k > 1
+                # 1. Duplicate the KV cache in the last partial page for all top-k segments
+                # 2. Modify generate_draft_decode_kv_indices accordingly
+
+            last_loc = get_last_loc(
+                batch.req_to_token_pool.req_to_token,
+                batch.req_pool_indices,
+                prefix_lens,
+            )
+            out_cache_loc, token_to_kv_pool_state_backup = (
+                batch.alloc_paged_token_slots_extend(
+                    prefix_lens,
+                    seq_lens,
+                    last_loc,
+                    extend_num_tokens,
+                    backup_state=True,
+                )
+            )
+
         assign_draft_cache_locs[(num_seqs,)](
             batch.req_pool_indices,
             batch.req_to_token_pool.req_to_token,
@@ -316,6 +379,7 @@ class EAGLEWorker(TpModelWorker):
             batch.req_to_token_pool.req_to_token.shape[1],
             self.topk,
             self.speculative_num_steps,
+            self.page_size,
         )
         batch.out_cache_loc = out_cache_loc
         batch.seq_lens_sum = torch.sum(batch.seq_lens).item()
@@ -343,6 +407,8 @@ class EAGLEWorker(TpModelWorker):
         # Run forward steps
         score_list, token_list, parents_list = self.draft_forward(forward_batch)
 
+        self.token_to_kv_pool_allocator.restore_state(token_to_kv_pool_state_backup)
+
         ret = EagleVerifyInput.create(
             spec_info.verified_id,
             score_list,
@@ -354,7 +420,7 @@ class EAGLEWorker(TpModelWorker):
             self.speculative_num_steps,
             self.server_args.speculative_num_draft_tokens,
         )
-        return ret
+        return ret
 
     def draft_forward(self, forward_batch: ForwardBatch):
         # Parse args
@@ -411,7 +477,7 @@ class EAGLEWorker(TpModelWorker):
         return score_list, token_list, parents_list
 
     def verify(self, batch: ScheduleBatch, spec_info: EagleVerifyInput):
-        spec_info.prepare_for_verify(batch)
+        spec_info.prepare_for_verify(batch, self.page_size)
         batch.forward_mode = ForwardMode.TARGET_VERIFY
         batch.spec_info = spec_info
         model_worker_batch = batch.get_model_worker_batch()
@@ -421,7 +487,10 @@ class EAGLEWorker(TpModelWorker):
         self._detect_nan_if_needed(logits_output)
         spec_info.hidden_states = logits_output.hidden_states
         res: EagleVerifyOutput = spec_info.verify(
-            batch,
+            batch,
+            logits_output,
+            self.token_to_kv_pool_allocator,
+            self.page_size,
         )
 
         # Post process based on verified outputs.
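Note on the new allocation path in eagle_worker.py: the `topk > 1` branch computes the paged-KV bookkeeping and then raises `NotImplementedError`, so for now it only documents the intended math. A standalone sketch of that page arithmetic with illustrative values (the page size, top-k, and draft length below are made up for the example, not sglang defaults):

```python
import torch

# Illustrative inputs; in eagle_worker.py these come from server_args and the batch.
page_size = 4
topk = 2
speculative_num_steps = 3
prefix_lens = torch.tensor([10, 17])  # current per-request sequence lengths

# Same expressions as the hunk above.
last_page_lens = prefix_lens % page_size  # tokens in the last, partially filled page
num_new_pages = (
    last_page_lens + speculative_num_steps + page_size - 1
) // page_size  # pages needed to hold the draft tokens, rounded up
seq_lens = (
    prefix_lens // page_size * page_size
    + num_new_pages * (page_size * topk)  # each top-k branch gets its own copy of the new pages
)
extend_num_tokens = int(torch.sum(seq_lens - prefix_lens).item())
print(seq_lens.tolist(), extend_num_tokens)  # [24, 24] 21
```

The prefix is rounded down to a page boundary and every top-k branch receives its own freshly allocated pages, which is why the extension grows by `num_new_pages * page_size * topk` per request.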
sglang/srt/utils.py CHANGED
@@ -12,7 +12,6 @@
 # limitations under the License.
 # ==============================================================================
 """Common utilities."""
-
 import base64
 import builtins
 import ctypes
@@ -35,8 +34,10 @@ import sys
 import tempfile
 import threading
 import time
+import traceback
 import warnings
 from contextlib import contextmanager
+from enum import Enum
 from functools import lru_cache
 from importlib.metadata import PackageNotFoundError, version
 from importlib.util import find_spec
@@ -53,6 +54,7 @@ import torch.distributed
 import torch.distributed as dist
 import triton
 import zmq
+from decord import VideoReader, cpu
 from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
 from PIL import Image
@@ -261,7 +263,7 @@ def get_available_gpu_memory(device, gpu_id, distributed=False, empty_cache=True
     When distributed is True, the available memory is the minimum available memory of all GPUs.
     """
     if device == "cuda":
-        num_gpus =
+        num_gpus = torch.cuda.device_count()
         assert gpu_id < num_gpus
 
         if torch.cuda.current_device() != gpu_id:
@@ -512,13 +514,18 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarray
     import soundfile as sf
     from scipy.signal import resample
 
-    # print(f"loading {audio_file}")
     # Load audio data
     if isinstance(audio_file, bytes):
         audio, original_sr = sf.read(BytesIO(audio_file))
     elif audio_file.startswith("data:"):
         audio_file = audio_file.split(",")[1]
         audio, original_sr = sf.read(BytesIO(base64.b64decode(audio_file)))
+    elif audio_file.startswith("http://") or audio_file.startswith("https://"):
+        timeout = int(os.getenv("REQUEST_TIMEOUT", "5"))
+        response = requests.get(audio_file, stream=True, timeout=timeout)
+        audio_file = BytesIO(response.content)
+        response.close()
+        audio, original_sr = sf.read(audio_file)
     elif isinstance(audio_file, str):
         audio, original_sr = sf.read(audio_file)
     else:
@@ -536,10 +543,38 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarray
     return audio
 
 
-def
-
+def encode_video(video_path, frame_count_limit=None):
+    if not os.path.exists(video_path):
+        logger.error(f"Video {video_path} does not exist")
+        return []
+
+    if frame_count_limit == 0:
+        return []
+
+    def uniform_sample(l, n):
+        gap = len(l) / n
+        idxs = [int(i * gap + gap / 2) for i in range(n)]
+        return [l[i] for i in idxs]
+
+    vr = VideoReader(video_path, ctx=cpu(0))
+    sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+    frame_indices = [i for i in range(0, len(vr), sample_fps)]
+    if frame_count_limit is not None and len(frame_indices) > frame_count_limit:
+        frame_indices = uniform_sample(frame_indices, frame_count_limit)
 
-
+    frames = vr.get_batch(frame_indices).asnumpy()
+    frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+    return frames
+
+
+def load_image(
+    image_file: Union[Image.Image, str, bytes]
+) -> tuple[Image.Image, tuple[int, int]]:
+    image = image_size = None
+    if isinstance(image_file, Image.Image):
+        image = image_file
+        image_size = (image.width, image.height)
+    elif isinstance(image_file, bytes):
         image = Image.open(BytesIO(image_file))
     elif image_file.startswith("http://") or image_file.startswith("https://"):
         timeout = int(os.getenv("REQUEST_TIMEOUT", "3"))
@@ -563,6 +598,10 @@ def load_image(image_file: Union[str, bytes]) -> tuple[Image, tuple[int, int]]:
 
 
 def suppress_other_loggers():
+    warnings.filterwarnings(
+        "ignore", category=UserWarning, message="The given NumPy array is not writable"
+    )
+
     try:
         from vllm.logger import logger as vllm_default_logger
     except ImportError:
@@ -577,10 +616,6 @@ def suppress_other_loggers():
     )
     logging.getLogger("vllm.config").setLevel(logging.ERROR)
 
-    warnings.filterwarnings(
-        "ignore", category=UserWarning, message="The given NumPy array is not writable"
-    )
-
 
 def assert_pkg_version(pkg: str, min_version: str, message: str):
     try:
@@ -1381,47 +1416,6 @@ def disable_request_logging() -> bool:
     return get_bool_env_var("SGLANG_DISABLE_REQUEST_LOGGING")
 
 
-@lru_cache(maxsize=8)
-def _cuda_device_count_stateless(cuda_visible_devices: Optional[str] = None) -> int:
-    # Note: cuda_visible_devices is not used, but we keep it as an argument for
-    # LRU Cache purposes.
-
-    # Code below is based on
-    # https://github.com/pytorch/pytorch/blob/
-    # c1cd946818442aca8c7f812b16d187ce1586c3bc/
-    # torch/cuda/__init__.py#L831C1-L831C17
-    import torch.version
-
-    if not torch.cuda._is_compiled():
-        return 0
-    if is_hip():
-        # ROCm uses amdsmi instead of nvml for stateless device count
-        # This requires a sufficiently modern version of Torch 2.4.0
-        raw_count = (
-            torch.cuda._device_count_amdsmi()
-            if (hasattr(torch.cuda, "_device_count_amdsmi"))
-            else -1
-        )
-    else:
-        raw_count = torch.cuda._device_count_nvml()
-    r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count
-    return r
-
-
-# Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/utils.py
-def cuda_device_count_stateless() -> int:
-    """Get number of CUDA devices, caching based on the value of
-    CUDA_VISIBLE_DEVICES at the time of call.
-
-    This should be used instead of torch.cuda.device_count()
-    unless CUDA_VISIBLE_DEVICES has already been set to the desired
-    value."""
-
-    # This can be removed and simply replaced with torch.cuda.get_device_count
-    # after https://github.com/pytorch/pytorch/pull/122815 is released.
-    return _cuda_device_count_stateless(os.environ.get("CUDA_VISIBLE_DEVICES", None))
-
-
 def dataclass_to_string_truncated(
     data, max_length=2048, skip_names: Optional[Set[str]] = None
 ):
@@ -1766,3 +1760,62 @@ def parse_connector_type(url: str) -> str:
         return ""
 
     return m.group(1)
+
+
+def retry(
+    fn,
+    max_retry: int,
+    initial_delay: float = 2.0,
+    max_delay: float = 60.0,
+    should_retry: Callable[[Any], bool] = lambda e: True,
+):
+    for try_index in itertools.count():
+        try:
+            return fn()
+        except Exception as e:
+            if try_index >= max_retry:
+                raise Exception(f"retry() exceed maximum number of retries.")
+
+            if not should_retry(e):
+                raise Exception(f"retry() observe errors that should not be retried.")
+
+            delay = min(initial_delay * (2**try_index), max_delay) * (
+                0.75 + 0.25 * random.random()
+            )
+
+            logger.warning(
+                f"retry() failed once ({try_index}th try, maximum {max_retry} retries). Will delay {delay:.2f}s and retry. Error: {e}"
+            )
+            traceback.print_exc()
+
+            time.sleep(delay)
+
+
+def flatten_nested_list(nested_list):
+    if isinstance(nested_list, list):
+        return [
+            item for sublist in nested_list for item in flatten_nested_list(sublist)
+        ]
+    else:
+        return [nested_list]
+
+
+class DeepEPMode(Enum):
+    normal = "normal"
+    low_latency = "low_latency"
+    auto = "auto"
+
+    def enable_normal(self):
+        return self in [DeepEPMode.normal, DeepEPMode.auto]
+
+    def enable_low_latency(self):
+        return self in [DeepEPMode.low_latency, DeepEPMode.auto]
+
+    def resolve(self, forward_mode):
+        if self != DeepEPMode.auto:
+            return self
+
+        if forward_mode.is_decode():
+            return DeepEPMode.low_latency
+        else:
+            return DeepEPMode.normal
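The `retry()` helper added above is a generic exponential-backoff loop with jitter; `CustomTestCase` in sglang/test/test_utils.py (further below) now routes flaky test methods through it. A minimal usage sketch; `flaky_call` is a hypothetical stand-in, not part of the package:

```python
import random

from sglang.srt.utils import retry


def flaky_call():
    # Hypothetical operation that fails transiently.
    if random.random() < 0.5:
        raise ConnectionError("transient failure")
    return "ok"


# Up to 3 retries. Delays grow as initial_delay * 2**try_index, capped at max_delay
# and scaled by a 0.75-1.0 jitter factor; exceptions rejected by should_retry abort
# the loop immediately instead of being retried.
result = retry(
    flaky_call,
    max_retry=3,
    should_retry=lambda e: isinstance(e, ConnectionError),
)
```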
sglang/test/test_custom_ops.py CHANGED
@@ -82,6 +82,61 @@ if is_cuda:
             dequantize_per_token(ref_y, scale, dtype),
         )
 
+    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+    def test_scaled_fp8_quant_with_padding(dtype) -> None:
+        original_rows = 5
+        x = (torch.randn(size=(original_rows, 16), device="cuda") * 13).to(dtype)
+
+        padding_size = 10
+
+        # Test with dynamic quantization
+        y_dynamic, scale_dynamic = scaled_fp8_quant(
+            x, None, num_token_padding=padding_size
+        )
+
+        # Verify output shape has the padded size
+        assert y_dynamic.shape[0] == padding_size
+        assert y_dynamic.shape[1] == x.shape[1]
+
+        # Verify that the actual data in the non-padded region is correctly quantized
+        y_without_padding, scale_without_padding = scaled_fp8_quant(x, None)
+        torch.testing.assert_close(y_dynamic[:original_rows], y_without_padding)
+
+        # Test with static quantization
+        # First get a scale
+        _, scale = scaled_fp8_quant(x, None)
+
+        # Then use it for static quantization with padding
+        y_static, _ = scaled_fp8_quant(x, scale, num_token_padding=padding_size)
+
+        # Verify output shape has the padded size
+        assert y_static.shape[0] == padding_size
+        assert y_static.shape[1] == x.shape[1]
+
+        # Verify that the actual data in the non-padded region is correctly quantized
+        y_static_without_padding, _ = scaled_fp8_quant(x, scale)
+        torch.testing.assert_close(y_static[:original_rows], y_static_without_padding)
+
+        # Test with per-token dynamic quantization
+        y_per_token, scale_per_token = scaled_fp8_quant(
+            x, None, num_token_padding=padding_size, use_per_token_if_dynamic=True
+        )
+
+        # Verify output shape has the padded size
+        assert y_per_token.shape[0] == padding_size
+        assert y_per_token.shape[1] == x.shape[1]
+
+        # Verify that the actual data in the non-padded region is correctly quantized
+        y_per_token_without_padding, scale_per_token_without_padding = scaled_fp8_quant(
+            x, None, use_per_token_if_dynamic=True
+        )
+        torch.testing.assert_close(
+            y_per_token[:original_rows], y_per_token_without_padding
+        )
+        torch.testing.assert_close(
+            scale_per_token[:original_rows], scale_per_token_without_padding
+        )
+
 
 if __name__ == "__main__":
     # Run the specific test function directly
sglang/test/test_utils.py CHANGED
@@ -25,7 +25,7 @@ from sglang.bench_serving import run_benchmark
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.srt.utils import get_bool_env_var, kill_process_tree
+from sglang.srt.utils import get_bool_env_var, kill_process_tree, retry
 from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback
 
@@ -76,11 +76,14 @@ def is_in_ci():
 
 
 if is_in_ci():
-    DEFAULT_PORT_FOR_SRT_TEST_RUNNER =
-
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
+        5000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
+    )
 else:
-    DEFAULT_PORT_FOR_SRT_TEST_RUNNER =
-
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = (
+        7000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
+    )
+DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
 
 
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
@@ -1010,26 +1013,10 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
 
 class CustomTestCase(unittest.TestCase):
     def _callTestMethod(self, method):
-
-
-            max_retry=_get_max_retry(),
+        max_retry = int(
+            os.environ.get("SGLANG_TEST_MAX_RETRY", "1" if is_in_ci() else "0")
         )
-
-
-
-    return int(os.environ.get("SGLANG_TEST_MAX_RETRY", "2" if is_in_ci() else "0"))
-
-
-def _retry_execution(fn, max_retry: int):
-    if max_retry == 0:
-        fn()
-        return
-
-    try:
-        fn()
-    except Exception as e:
-        print(
-            f"retry_execution failed once and will retry. This may be an error or a flaky test. Error: {e}"
+        retry(
+            lambda: super(CustomTestCase, self)._callTestMethod(method),
+            max_retry=max_retry,
         )
-        traceback.print_exc()
-        _retry_execution(fn, max_retry=max_retry - 1)
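For reference, the reworked port selection above derives the test port from the first character of CUDA_VISIBLE_DEVICES, presumably so runs pinned to different GPUs pick distinct ports. A small sketch of the same expression (CI branch shown; the non-CI branch starts from 7000):

```python
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "3,4"

# Mirrors the CI branch of test_utils.py: base 5000 plus 100 per leading device id.
port = 5000 + int(os.environ.get("CUDA_VISIBLE_DEVICES", "0")[0]) * 100
url = f"http://127.0.0.1:{port + 1000}"
print(port, url)  # 5300 http://127.0.0.1:6300
```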
sglang/utils.py CHANGED
@@ -25,8 +25,6 @@ from IPython.display import HTML, display
 from pydantic import BaseModel
 from tqdm import tqdm
 
-from sglang.srt.utils import kill_process_tree
-
 logger = logging.getLogger(__name__)
 
 
@@ -422,6 +420,8 @@ def terminate_process(process):
     """
     Terminate the process and automatically release the reserved port.
     """
+    from sglang.srt.utils import kill_process_tree
+
     kill_process_tree(process.pid)
 
     lock_socket = process_socket_map.pop(process, None)
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.4.post3"
+__version__ = "0.4.5"
{sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.4.post3
+Version: 0.4.5
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -234,18 +234,19 @@ Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
+Requires-Dist: pynvml; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
-Requires-Dist: transformers==4.
+Requires-Dist: transformers==4.51.0; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.
+Requires-Dist: sgl-kernel==0.0.8; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"