sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +149 -34
- sglang/bench_serving.py +73 -14
- sglang/compile_deep_gemm.py +13 -7
- sglang/launch_server.py +2 -0
- sglang/srt/batch_invariant_ops/__init__.py +2 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +221 -4
- sglang/srt/checkpoint_engine/__init__.py +9 -0
- sglang/srt/checkpoint_engine/update.py +317 -0
- sglang/srt/compilation/backend.py +1 -1
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/deepseek_ocr.py +542 -10
- sglang/srt/configs/deepseekvl2.py +95 -194
- sglang/srt/configs/kimi_linear.py +160 -0
- sglang/srt/configs/mamba_utils.py +66 -0
- sglang/srt/configs/model_config.py +30 -7
- sglang/srt/constants.py +7 -0
- sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
- sglang/srt/disaggregation/decode.py +34 -6
- sglang/srt/disaggregation/nixl/conn.py +2 -2
- sglang/srt/disaggregation/prefill.py +25 -3
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
- sglang/srt/distributed/parallel_state.py +9 -12
- sglang/srt/entrypoints/engine.py +31 -20
- sglang/srt/entrypoints/grpc_server.py +0 -1
- sglang/srt/entrypoints/http_server.py +94 -94
- sglang/srt/entrypoints/openai/protocol.py +7 -1
- sglang/srt/entrypoints/openai/serving_chat.py +42 -0
- sglang/srt/entrypoints/openai/serving_completions.py +10 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/environ.py +23 -2
- sglang/srt/eplb/expert_distribution.py +64 -1
- sglang/srt/eplb/expert_location.py +106 -36
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/minimax_m2.py +367 -0
- sglang/srt/grpc/compile_proto.py +3 -0
- sglang/srt/layers/activation.py +6 -0
- sglang/srt/layers/attention/ascend_backend.py +233 -5
- sglang/srt/layers/attention/attention_registry.py +3 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
- sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
- sglang/srt/layers/attention/fla/kda.py +1359 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
- sglang/srt/layers/attention/flashattention_backend.py +19 -8
- sglang/srt/layers/attention/flashinfer_backend.py +10 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -11
- sglang/srt/layers/attention/flashmla_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
- sglang/srt/layers/attention/mamba/mamba.py +20 -11
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
- sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
- sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
- sglang/srt/layers/attention/nsa/transform_index.py +1 -1
- sglang/srt/layers/attention/nsa_backend.py +157 -23
- sglang/srt/layers/attention/triton_backend.py +4 -1
- sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
- sglang/srt/layers/attention/trtllm_mla_backend.py +11 -15
- sglang/srt/layers/attention/utils.py +78 -0
- sglang/srt/layers/communicator.py +24 -1
- sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/layernorm.py +35 -6
- sglang/srt/layers/logits_processor.py +9 -20
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
- sglang/srt/layers/moe/ep_moe/layer.py +78 -289
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
- sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +340 -55
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
- sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
- sglang/srt/layers/moe/token_dispatcher/deepep.py +25 -18
- sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
- sglang/srt/layers/moe/topk.py +35 -10
- sglang/srt/layers/moe/utils.py +3 -4
- sglang/srt/layers/pooler.py +21 -2
- sglang/srt/layers/quantization/__init__.py +13 -84
- sglang/srt/layers/quantization/auto_round.py +394 -0
- sglang/srt/layers/quantization/awq.py +0 -3
- sglang/srt/layers/quantization/base_config.py +7 -0
- sglang/srt/layers/quantization/fp8.py +68 -63
- sglang/srt/layers/quantization/fp8_kernel.py +1 -1
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/gguf.py +566 -0
- sglang/srt/layers/quantization/modelopt_quant.py +168 -11
- sglang/srt/layers/quantization/mxfp4.py +30 -38
- sglang/srt/layers/quantization/unquant.py +23 -45
- sglang/srt/layers/quantization/w4afp8.py +38 -2
- sglang/srt/layers/radix_attention.py +5 -2
- sglang/srt/layers/rotary_embedding.py +130 -46
- sglang/srt/layers/sampler.py +12 -1
- sglang/srt/lora/lora_registry.py +9 -0
- sglang/srt/managers/async_mm_data_processor.py +122 -0
- sglang/srt/managers/data_parallel_controller.py +30 -3
- sglang/srt/managers/detokenizer_manager.py +3 -0
- sglang/srt/managers/io_struct.py +29 -4
- sglang/srt/managers/multi_tokenizer_mixin.py +22 -1
- sglang/srt/managers/schedule_batch.py +74 -15
- sglang/srt/managers/scheduler.py +185 -144
- sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
- sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
- sglang/srt/managers/scheduler_pp_mixin.py +7 -2
- sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
- sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
- sglang/srt/managers/session_controller.py +6 -5
- sglang/srt/managers/tokenizer_manager.py +165 -78
- sglang/srt/managers/tp_worker.py +24 -1
- sglang/srt/mem_cache/base_prefix_cache.py +23 -4
- sglang/srt/mem_cache/common.py +1 -0
- sglang/srt/mem_cache/hicache_storage.py +7 -1
- sglang/srt/mem_cache/memory_pool.py +253 -57
- sglang/srt/mem_cache/memory_pool_host.py +12 -5
- sglang/srt/mem_cache/radix_cache.py +4 -0
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
- sglang/srt/metrics/collector.py +46 -3
- sglang/srt/model_executor/cuda_graph_runner.py +15 -3
- sglang/srt/model_executor/forward_batch_info.py +55 -14
- sglang/srt/model_executor/model_runner.py +77 -170
- sglang/srt/model_executor/npu_graph_runner.py +7 -3
- sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
- sglang/srt/model_loader/weight_utils.py +1 -1
- sglang/srt/models/bailing_moe.py +9 -2
- sglang/srt/models/deepseek_nextn.py +11 -2
- sglang/srt/models/deepseek_v2.py +296 -78
- sglang/srt/models/glm4.py +391 -77
- sglang/srt/models/glm4_moe.py +322 -354
- sglang/srt/models/glm4_moe_nextn.py +4 -14
- sglang/srt/models/glm4v.py +196 -55
- sglang/srt/models/glm4v_moe.py +29 -197
- sglang/srt/models/gpt_oss.py +1 -10
- sglang/srt/models/kimi_linear.py +678 -0
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/llama_eagle3.py +11 -1
- sglang/srt/models/longcat_flash.py +2 -2
- sglang/srt/models/minimax_m2.py +922 -0
- sglang/srt/models/nvila.py +355 -0
- sglang/srt/models/nvila_lite.py +184 -0
- sglang/srt/models/qwen2.py +23 -2
- sglang/srt/models/qwen2_moe.py +30 -15
- sglang/srt/models/qwen3.py +35 -5
- sglang/srt/models/qwen3_moe.py +18 -12
- sglang/srt/models/qwen3_next.py +7 -0
- sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
- sglang/srt/multimodal/processors/base_processor.py +1 -0
- sglang/srt/multimodal/processors/glm4v.py +1 -1
- sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
- sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
- sglang/srt/multiplex/multiplexing_mixin.py +209 -0
- sglang/srt/multiplex/pdmux_context.py +164 -0
- sglang/srt/parser/conversation.py +7 -1
- sglang/srt/parser/reasoning_parser.py +28 -1
- sglang/srt/sampling/custom_logit_processor.py +67 -1
- sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
- sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
- sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
- sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
- sglang/srt/server_args.py +459 -199
- sglang/srt/single_batch_overlap.py +2 -4
- sglang/srt/speculative/draft_utils.py +16 -0
- sglang/srt/speculative/eagle_info.py +42 -36
- sglang/srt/speculative/eagle_info_v2.py +68 -25
- sglang/srt/speculative/eagle_utils.py +261 -16
- sglang/srt/speculative/eagle_worker.py +11 -3
- sglang/srt/speculative/eagle_worker_v2.py +15 -9
- sglang/srt/speculative/spec_info.py +305 -31
- sglang/srt/speculative/spec_utils.py +44 -8
- sglang/srt/tracing/trace.py +121 -12
- sglang/srt/utils/common.py +142 -74
- sglang/srt/utils/hf_transformers_utils.py +38 -12
- sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
- sglang/test/kits/radix_cache_server_kit.py +50 -0
- sglang/test/runners.py +31 -7
- sglang/test/simple_eval_common.py +5 -3
- sglang/test/simple_eval_humaneval.py +1 -0
- sglang/test/simple_eval_math.py +1 -0
- sglang/test/simple_eval_mmlu.py +1 -0
- sglang/test/simple_eval_mmmu_vlm.py +1 -0
- sglang/test/test_deterministic.py +235 -12
- sglang/test/test_deterministic_utils.py +2 -1
- sglang/test/test_utils.py +7 -1
- sglang/version.py +1 -1
- {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +15 -28
- {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +194 -175
- sglang/srt/models/vila.py +0 -306
- /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
- {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
- {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen3.py
CHANGED
@@ -29,6 +29,7 @@ from sglang.srt.model_loader.weight_utils import (
 )
 from sglang.srt.models.qwen2 import Qwen2MLP as Qwen3MLP
 from sglang.srt.models.qwen2 import Qwen2Model
+from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import (
     add_prefix,
     get_cmo_stream,
@@ -88,8 +89,16 @@ class Qwen3Attention(nn.Module):
         self.max_position_embeddings = max_position_embeddings
         self.tp_rank = get_tensor_model_parallel_rank()

-        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
-        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        norm_kwargs = (
+            dict(
+                weight_dtype=torch.float32,
+                cast_x_before_out_mul=True,
+            )
+            if get_global_server_args().rl_on_policy_target == "fsdp"
+            else {}
+        )
+        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps, **norm_kwargs)
+        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps, **norm_kwargs)

         self.qkv_proj = QKVParallelLinear(
             hidden_size,
@@ -158,10 +167,18 @@ class Qwen3Attention(nn.Module):
         hidden_states: torch.Tensor,
         forward_batch: ForwardBatch,
     ) -> torch.Tensor:
+        if get_global_server_args().rl_on_policy_target == "fsdp":
+            hidden_states = hidden_states.bfloat16()
+
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self._apply_qk_norm(q, k)
         q, k = self.rotary_emb(positions, q, k)
+
+        if get_global_server_args().rl_on_policy_target == "fsdp":
+            q = q.to(torch.bfloat16)
+            k = k.to(torch.bfloat16)
+
         attn_output = self.attn(q, k, v, forward_batch)
         output, _ = self.o_proj(attn_output)
         return output
@@ -204,9 +221,22 @@ class Qwen3DecoderLayer(nn.Module):
             quant_config=quant_config,
             prefix=add_prefix("mlp", prefix),
         )
-        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        norm_kwargs = (
+            dict(
+                weight_dtype=torch.float32,
+                cast_x_before_out_mul=True,
+                override_orig_dtype=torch.float32,
+                fp32_residual=True,
+            )
+            if get_global_server_args().rl_on_policy_target == "fsdp"
+            else {}
+        )
+        self.input_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps, **norm_kwargs
+        )
         self.post_attention_layernorm = RMSNorm(
-            config.hidden_size, eps=config.rms_norm_eps
+            config.hidden_size, eps=config.rms_norm_eps, **norm_kwargs
         )

         self.layer_scatter_modes = LayerScatterModes.init_new(
@@ -331,7 +361,7 @@ class Qwen3ForCausalLM(nn.Module):
             self.pp_group.send(
                 self.model.embed_tokens.weight, dst=self.pp_group.last_rank
             )
-        else:
+        elif self.pp_group.is_last_rank:
             emb_token_weight = self.pp_group.recv(
                 size=(config.vocab_size, config.hidden_size),
                 dtype=next(self.model.parameters()).dtype,
sglang/srt/models/qwen3_moe.py
CHANGED
@@ -241,16 +241,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         )

     def op_experts(self, state):
-        state.
+        state.combine_input = self.experts.run_moe_core(
             dispatch_output=state.dispatch_output,
         )

     def op_combine_a(self, state):
         if self.ep_size > 1:
             self.experts.dispatcher.combine_a(
-
-                topk_ids=state.dispatch_output.topk_ids,
-                topk_weights=state.dispatch_output.topk_weights,
+                combine_input=state.pop("combine_input"),
                 tbo_subbatch_index=state.get("tbo_subbatch_index"),
             )
             state.pop("dispatch_output")
@@ -539,10 +537,16 @@ class Qwen3MoeDecoderLayer(nn.Module):
         hidden_states: torch.Tensor,
         forward_batch: ForwardBatch,
         residual: Optional[torch.Tensor],
+        captured_last_layer_outputs: Optional[List[torch.Tensor]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:

-        hidden_states, residual =
-
+        hidden_states, residual = (
+            self.layer_communicator.prepare_attn_and_capture_last_layer_outputs(
+                hidden_states,
+                residual,
+                forward_batch,
+                captured_last_layer_outputs=captured_last_layer_outputs,
+            )
         )

         if hidden_states.shape[0] != 0:
@@ -774,13 +778,15 @@ class Qwen3MoeForCausalLM(nn.Module):
         self.capture_aux_hidden_states = True
         if layer_ids is None:
             num_layers = self.config.num_hidden_layers
-            self.model.
-
-
-
-
+            self.model.set_eagle3_layers_to_capture(
+                [
+                    2,
+                    num_layers // 2,
+                    num_layers - 3,
+                ]
+            )  # Specific layers for EAGLE3 support
         else:
-            self.model.
+            self.model.set_eagle3_layers_to_capture([val + 1 for val in layer_ids])

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
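In the last hunk above, the default EAGLE3 aux-hidden-state capture points are layers 2, num_layers // 2, and num_layers - 3 (shifted by +1 when explicit layer_ids are passed). A standalone restatement of just that arithmetic; the helper name is made up for illustration, only the values come from the diff.

```python
def default_eagle3_capture_layers(num_layers: int) -> list[int]:
    # Early, middle, and late layers are captured for EAGLE3 speculative decoding.
    return [2, num_layers // 2, num_layers - 3]


print(default_eagle3_capture_layers(48))  # [2, 24, 45]
```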
sglang/srt/models/qwen3_next.py
CHANGED
@@ -478,6 +478,13 @@ class Qwen3GatedDeltaNet(nn.Module):
         # reshape input data into 2D tensor
         core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
         z = z.reshape(-1, z.shape[-1])
+
+        # Add padding for DP-Attn
+        if is_dp_attention_enabled():
+            core_attn_out_pad = torch.zeros_like(z)
+            core_attn_out_pad[: core_attn_out.shape[0], :] = core_attn_out
+            core_attn_out = core_attn_out_pad
+
         core_attn_out = self.norm(core_attn_out, z)
         core_attn_out = core_attn_out.reshape(z_shape_og)
         core_attn_out = core_attn_out.reshape(*core_attn_out.shape[:-2], -1)
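The added block zero-pads core_attn_out up to the (DP-attention padded) length of z before the gated norm. A standalone toy of the same padding step, using stand-in tensors rather than the real Qwen3GatedDeltaNet state:

```python
import torch

z = torch.randn(10, 4)             # padded length under data-parallel attention
core_attn_out = torch.randn(7, 4)  # only the locally computed rows

# Pad with zero rows so the leading dimensions match before the gated norm.
core_attn_out_pad = torch.zeros_like(z)
core_attn_out_pad[: core_attn_out.shape[0], :] = core_attn_out
core_attn_out = core_attn_out_pad
print(core_attn_out.shape)  # torch.Size([10, 4])
```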
sglang/srt/multimodal/customized_mm_processor_utils.py
ADDED
@@ -0,0 +1,35 @@
+from typing import Dict, Type
+
+from transformers import PretrainedConfig, ProcessorMixin
+
+# Useful for registering a custom processor different from Hugging Face's default.
+_CUSTOMIZED_MM_PROCESSOR: Dict[str, Type[ProcessorMixin]] = dict()
+
+
+def register_customized_processor(
+    processor_class: Type[ProcessorMixin],
+):
+    """Class decorator that maps a config class's model_type field to a customized processor class.
+
+    Args:
+        processor_class: A processor class that inherits from ProcessorMixin
+
+    Example:
+        ```python
+        @register_customized_processor(MyCustomProcessor)
+        class MyModelConfig(PretrainedConfig):
+            model_type = "my_model"
+
+        ```
+    """
+
+    def decorator(config_class: PretrainedConfig):
+        if not hasattr(config_class, "model_type"):
+            raise ValueError(
+                f"Class {config_class.__name__} with register_customized_processor should "
+                f"have a 'model_type' class attribute."
+            )
+        _CUSTOMIZED_MM_PROCESSOR[config_class.model_type] = processor_class
+        return config_class
+
+    return decorator
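A hypothetical usage sketch of the new decorator, assuming sglang 0.5.4.post2 and transformers are installed; MyProcessor and MyConfig are made-up names, and _CUSTOMIZED_MM_PROCESSOR is the module-level registry defined in the file above.

```python
from transformers import PretrainedConfig, ProcessorMixin

from sglang.srt.multimodal.customized_mm_processor_utils import (
    _CUSTOMIZED_MM_PROCESSOR,
    register_customized_processor,
)


class MyProcessor(ProcessorMixin):
    """Placeholder processor class for illustration."""


@register_customized_processor(MyProcessor)
class MyConfig(PretrainedConfig):
    model_type = "my_model"


# The decorator records the mapping from the config's model_type to the processor.
assert _CUSTOMIZED_MM_PROCESSOR["my_model"] is MyProcessor
```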
sglang/srt/multimodal/processors/base_processor.py
CHANGED
@@ -185,6 +185,7 @@ class BaseMultimodalProcessor(ABC):
             "aspect_ratio_mask": Modality.IMAGE,
             "num_patches": Modality.IMAGE,
             "patch_pixel_values": Modality.IMAGE,
+            "block_sizes": Modality.IMAGE,
             # Audio-related attributes
             "audio_features": Modality.AUDIO,
             "audio_feature_lens": Modality.AUDIO,
sglang/srt/multimodal/processors/glm4v.py
CHANGED
@@ -17,7 +17,7 @@ class Glm4vImageProcessor(SGLangBaseProcessor):
     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
         super().__init__(hf_config, server_args, _processor, *args, **kwargs)

-        # GLM-
+        # GLM-V specific tokens
         self.IMAGE_TOKEN = "<|image|>"
         self.VIDEO_TOKEN = "<|video|>"
         self.IMAGE_START_TOKEN = "<|begin_of_image|>"
sglang/srt/multimodal/processors/{vila.py → nvila.py}
CHANGED
@@ -1,64 +1,72 @@
-from typing import Any
+from typing import Any

 import torch.nn as nn
 from transformers.configuration_utils import PretrainedConfig
 from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase

-from sglang.srt.managers.io_struct import
-
-
-    ImageDataInputItem,
-)
-from sglang.srt.models.vila import VILAForConditionalGeneration
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.models.nvila import NVILAForConditionalGeneration
+from sglang.srt.models.nvila_lite import NVILALiteForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
 from sglang.srt.server_args import ServerArgs

+NUM_VIDEO_FRAMES = 8

-class VILAProcessor(ProcessorMixin):
-    """A stub class for the VILA processor."""
-
-    tokenizer: PreTrainedTokenizerBase
-
-
-class VILAMultimodalProcessor(BaseMultimodalProcessor):
-    models: List[Type[nn.Module]] = [VILAForConditionalGeneration]

-
+class NVILAMultimodalProcessor(BaseMultimodalProcessor):
+    models: list[type[nn.Module]] = [
+        NVILAForConditionalGeneration,
+        NVILALiteForConditionalGeneration,
+    ]

     def __init__(
         self,
         hf_config: PretrainedConfig,
         server_args: ServerArgs,
-        _processor:
+        _processor: ProcessorMixin,
         *args,
         **kwargs,
     ) -> None:
         super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+        self._processor: ProcessorMixin
+
+        tokenizer: PreTrainedTokenizerBase = getattr(self._processor, "tokenizer")
+
         self.mm_tokens = MultimodalSpecialTokens(
-            image_token=
+            image_token=tokenizer.image_token,
             image_token_id=hf_config.image_token_id,
+            video_token=tokenizer.video_token,
             video_token_id=hf_config.video_token_id,
         ).build(_processor)

     async def process_mm_data_async(
         self,
-        image_data
-
-
+        image_data,
+        audio_data,
+        input_text,
+        request_obj: GenerateReqInput,
         **kwargs,
-    ) ->
+    ) -> dict[str, Any] | None:
         base_output = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=self.mm_tokens,
-            image_data=image_data,
+            image_data=request_obj.image_data,  # type: ignore
+            video_data=request_obj.video_data,  # type: ignore
         )

+        for i, video in enumerate(base_output.videos):  # type: ignore
+            base_output.videos[i] = [x.asnumpy() for x in video]  # type: ignore
+
         mm_items, input_ids, _ = self.process_and_combine_mm_data(
-            base_output,
+            base_output,
+            self.mm_tokens,
+            do_sample_frames=True,
+            num_frames=NUM_VIDEO_FRAMES,
         )

         return {
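The rewritten processor converts decoded frames with asnumpy() and passes do_sample_frames=True and num_frames=NUM_VIDEO_FRAMES (8) down to process_and_combine_mm_data. The helper below is not the sglang implementation, just a standalone illustration of sampling a fixed frame budget uniformly from a decoded video.

```python
import numpy as np

NUM_VIDEO_FRAMES = 8


def sample_frames_uniformly(frames: list, num_frames: int = NUM_VIDEO_FRAMES) -> list:
    # Pick num_frames indices spread evenly across the decoded video.
    if len(frames) <= num_frames:
        return frames
    idx = np.linspace(0, len(frames) - 1, num_frames).round().astype(int)
    return [frames[i] for i in idx]


video = [np.zeros((4, 4, 3), dtype=np.uint8) for _ in range(30)]
print(len(sample_frames_uniformly(video)))  # 8
```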
sglang/srt/multimodal/processors/points_v15_chat.py
CHANGED
@@ -7,12 +7,12 @@ from PIL import Image

 from sglang.srt.models.points_v15_chat import POINTSV15ChatModel
 from sglang.srt.multimodal.processors.qwen_vl import (
-
+    QwenVLImageProcessor,
     resize_image_async,
 )


-class POINTSV15ChatProcessor(
+class POINTSV15ChatProcessor(QwenVLImageProcessor):
     models = [POINTSV15ChatModel]

     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
sglang/srt/multiplex/multiplexing_mixin.py
ADDED
@@ -0,0 +1,209 @@
+"""
+Mixin class providing multiplexing scheduling logic
+"""
+
+import logging
+
+import torch
+import torch.distributed as dist
+from torch.cuda.streams import ExternalStream
+
+from sglang.srt.distributed.parallel_state import set_pdmux_status
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.multiplex.pdmux_context import (
+    get_current_stream_idx,
+    get_sm_counts,
+    get_stream_groups,
+    initialize_stream_groups,
+    load_pdmux_config,
+    set_current_stream_idx,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class SchedulerMultiplexMixin:
+
+    def init_pdmux(self):
+        # for pd_multiplexing, Init stream_groups, exclude normal stream for prefill only and decode only
+        self.pdmux_config = load_pdmux_config(self.server_args.pdmux_config_path)
+        initialize_stream_groups(self.gpu_id, self.pdmux_config)
+        self.stream_groups = get_stream_groups()
+        self.sm_counts = get_sm_counts()
+        self.real_sm_group_num = len(self.stream_groups)
+        logger.info(
+            f"PD-Multiplexing enabled with {self.real_sm_group_num} stream groups, sm_counts (prefill_sm, decode_sm): {self.sm_counts}"
+        )
+
+    # TODO(jason-fxz): This is a temporary demo
+    def adjust_stream_groups(self) -> tuple[int, tuple[ExternalStream, ExternalStream]]:
+        if not self.running_batch.is_empty() and self.split_prefill_batch:
+            decode_bs = self.running_batch.batch_size()
+            manual_divisions = self.pdmux_config.manual_divisions
+            if manual_divisions:
+                for i in range(len(manual_divisions)):
+                    _, _, threshold = manual_divisions[i]
+                    if decode_bs >= threshold:
+                        stream_idx = i + 1
+            else:
+                stream_idx = max(
+                    1,
+                    min(
+                        self.real_sm_group_num - 2,
+                        decode_bs
+                        * (self.real_sm_group_num - 2)
+                        // self.pdmux_config.decode_bs_divisor,
+                    ),
+                )
+            set_current_stream_idx(stream_idx)
+        elif not self.running_batch.is_empty():
+            set_current_stream_idx(self.real_sm_group_num - 1)
+        else:
+            set_current_stream_idx(0)
+
+        stream_idx = get_current_stream_idx()
+
+        self.tp_worker.model_runner.update_decode_attn_backend(stream_idx)
+        return stream_idx, self.stream_groups[stream_idx]
+
+    def update_split_prefill_batch(self, sm_count: int) -> bool:
+        if self.split_prefill_batch:
+            return False
+
+        # add new request
+        batch = self.get_new_batch_prefill()
+        if batch and not batch.is_empty():
+            batch.forward_mode = (
+                ForwardMode.SPLIT_PREFILL
+            )  # Set forward mode for split prefill
+            self.split_prefill_batch = batch
+            return True
+        return False
+
+    @torch.inference_mode()
+    def event_loop_pdmux(self):
+        """A scheduler loop for pd multiplexing."""
+        decode_done = False
+        prefill_done = False
+        wait_prefill_kernel_done = False
+        adjust_stream_group = False
+        stream_idx = get_current_stream_idx()
+        stream_group = self.stream_groups[stream_idx]
+        prefill_stream = stream_group[0]
+        decode_stream = stream_group[1]
+        torch.cuda.empty_cache()
+
+        logger.debug("Starting event loop for pd multiplexing...")
+
+        while True:
+            with torch.cuda.stream(decode_stream):
+                set_pdmux_status(False)
+                recv_reqs = self.recv_requests()
+                self.process_input_requests(recv_reqs)
+
+            with torch.cuda.stream(prefill_stream):
+                set_pdmux_status(True)
+                sm_count = self.sm_counts[stream_idx][0]
+                if not wait_prefill_kernel_done:
+                    adjust_stream_group = (
+                        self.update_split_prefill_batch(sm_count) or adjust_stream_group
+                    )
+
+            with torch.cuda.stream(decode_stream):
+                set_pdmux_status(False)
+                self.running_batch = self.update_running_batch(self.running_batch)
+                adjust_stream_group = adjust_stream_group or (
+                    stream_idx > 0 and self.running_batch.is_empty()
+                )
+                if self.running_batch.is_empty() and self.split_prefill_batch is None:
+                    self.check_memory()
+                    self.check_tree_cache()
+                    self.new_token_ratio = self.init_new_token_ratio
+                    self.maybe_sleep_on_idle()
+
+            if adjust_stream_group:
+                prefill_stream.synchronize()
+                decode_stream.synchronize()
+                stream_idx, stream_group = self.adjust_stream_groups()
+                prefill_stream = stream_group[0]
+                decode_stream = stream_group[1]
+                adjust_stream_group = False
+                logger.debug(
+                    f"Adjusting stream groups: {stream_idx}, prefill sm: {self.sm_counts[stream_idx][0]}, decode sm: {self.sm_counts[stream_idx][1]}"
+                )
+
+            with torch.cuda.stream(decode_stream):
+                set_pdmux_status(False)
+                # process decode batch
+                if self.running_batch and not self.running_batch.is_empty():
+                    decode_result = self.run_batch(self.running_batch)
+                    decode_done = True
+                else:
+                    decode_done = False
+            with torch.cuda.stream(prefill_stream):
+                set_pdmux_status(True)
+                if (
+                    self.split_prefill_batch
+                    and not self.split_prefill_batch.is_empty()
+                    and not wait_prefill_kernel_done
+                ):
+                    prefill_done = True
+                    forward_count = (
+                        max(
+                            1,
+                            self.pdmux_config.split_forward_token_budget
+                            // self.split_prefill_batch.extend_num_tokens,
+                        )
+                        if self.split_prefill_batch.extend_num_tokens > 0
+                        else self.model_config.num_hidden_layers
+                    )
+                    next_split_index = min(
+                        self.split_prefill_batch.split_index + forward_count,
+                        self.model_config.num_hidden_layers,
+                    )
+                    forward_count = (
+                        next_split_index - self.split_prefill_batch.split_index
+                    )
+
+                    self.split_prefill_batch.split_forward_count = forward_count
+                    prefill_result = self.run_batch(self.split_prefill_batch)
+                    if next_split_index == self.model_config.num_hidden_layers:
+                        self.split_prefill_batch.split_prefill_finished = True
+                        prefill_exe_done = prefill_stream.record_event()
+                    self.split_prefill_batch.split_index = next_split_index
+
+                elif wait_prefill_kernel_done:
+                    prefill_done = True
+                else:
+                    prefill_done = False
+
+            with torch.cuda.stream(decode_stream):
+                set_pdmux_status(False)
+                decode_stream.synchronize()
+                if decode_done:
+                    self.process_batch_result(self.running_batch, decode_result)
+
+            with torch.cuda.stream(prefill_stream):
+                set_pdmux_status(True)
+                if prefill_done and self.split_prefill_batch.split_prefill_finished:
+                    wait_prefill_kernel_done = True
+                    prefill_exe_done_flag = prefill_exe_done.query()
+                    flags = (
+                        torch.ones(1, device="cpu", dtype=torch.int32)
+                        if prefill_exe_done_flag
+                        else torch.zeros(1, device="cpu", dtype=torch.int32)
+                    )
+
+                    self.tp_cpu_group.allreduce(flags, dist.ReduceOp.SUM).wait()
+                    if flags.item() == self.tp_size:
+                        self.process_batch_result(
+                            self.split_prefill_batch, prefill_result
+                        )
+                        if self.running_batch and not self.running_batch.is_empty():
+                            self.running_batch.merge_batch(self.split_prefill_batch)
+                        else:
+                            self.running_batch = self.split_prefill_batch
+
+                        self.split_prefill_batch = None
+                        wait_prefill_kernel_done = False
+                        adjust_stream_group = True