sglang 0.5.4.post1__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +18 -3
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  5. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +120 -0
  6. sglang/srt/checkpoint_engine/__init__.py +9 -0
  7. sglang/srt/checkpoint_engine/update.py +317 -0
  8. sglang/srt/configs/__init__.py +2 -0
  9. sglang/srt/configs/deepseek_ocr.py +542 -10
  10. sglang/srt/configs/deepseekvl2.py +95 -194
  11. sglang/srt/configs/kimi_linear.py +160 -0
  12. sglang/srt/configs/mamba_utils.py +66 -0
  13. sglang/srt/configs/model_config.py +25 -2
  14. sglang/srt/constants.py +7 -0
  15. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  16. sglang/srt/disaggregation/decode.py +34 -6
  17. sglang/srt/disaggregation/nixl/conn.py +2 -2
  18. sglang/srt/disaggregation/prefill.py +25 -3
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  20. sglang/srt/distributed/parallel_state.py +9 -5
  21. sglang/srt/entrypoints/engine.py +13 -5
  22. sglang/srt/entrypoints/http_server.py +22 -3
  23. sglang/srt/entrypoints/openai/protocol.py +7 -1
  24. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  27. sglang/srt/environ.py +7 -0
  28. sglang/srt/eplb/expert_distribution.py +34 -1
  29. sglang/srt/eplb/expert_location.py +106 -36
  30. sglang/srt/grpc/compile_proto.py +3 -0
  31. sglang/srt/layers/attention/ascend_backend.py +233 -5
  32. sglang/srt/layers/attention/attention_registry.py +3 -0
  33. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  34. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  35. sglang/srt/layers/attention/fla/kda.py +1359 -0
  36. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  37. sglang/srt/layers/attention/flashattention_backend.py +7 -6
  38. sglang/srt/layers/attention/flashinfer_mla_backend.py +3 -1
  39. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  40. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  41. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  42. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  43. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  44. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  45. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  46. sglang/srt/layers/attention/nsa_backend.py +157 -23
  47. sglang/srt/layers/attention/triton_backend.py +4 -1
  48. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  49. sglang/srt/layers/attention/trtllm_mla_backend.py +10 -2
  50. sglang/srt/layers/communicator.py +23 -1
  51. sglang/srt/layers/layernorm.py +16 -2
  52. sglang/srt/layers/logits_processor.py +4 -20
  53. sglang/srt/layers/moe/ep_moe/layer.py +0 -18
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  57. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  59. sglang/srt/layers/moe/moe_runner/deep_gemm.py +53 -33
  60. sglang/srt/layers/moe/token_dispatcher/deepep.py +12 -9
  61. sglang/srt/layers/moe/topk.py +31 -6
  62. sglang/srt/layers/pooler.py +21 -2
  63. sglang/srt/layers/quantization/__init__.py +9 -78
  64. sglang/srt/layers/quantization/auto_round.py +394 -0
  65. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  66. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  67. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  68. sglang/srt/layers/rotary_embedding.py +117 -45
  69. sglang/srt/lora/lora_registry.py +9 -0
  70. sglang/srt/managers/async_mm_data_processor.py +122 -0
  71. sglang/srt/managers/data_parallel_controller.py +30 -3
  72. sglang/srt/managers/detokenizer_manager.py +3 -0
  73. sglang/srt/managers/io_struct.py +26 -4
  74. sglang/srt/managers/multi_tokenizer_mixin.py +5 -0
  75. sglang/srt/managers/schedule_batch.py +74 -15
  76. sglang/srt/managers/scheduler.py +164 -129
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  78. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  79. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  80. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  81. sglang/srt/managers/session_controller.py +6 -5
  82. sglang/srt/managers/tokenizer_manager.py +154 -59
  83. sglang/srt/managers/tp_worker.py +24 -1
  84. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  85. sglang/srt/mem_cache/common.py +1 -0
  86. sglang/srt/mem_cache/memory_pool.py +171 -57
  87. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  88. sglang/srt/mem_cache/radix_cache.py +4 -0
  89. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  90. sglang/srt/metrics/collector.py +46 -3
  91. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  92. sglang/srt/model_executor/forward_batch_info.py +11 -11
  93. sglang/srt/model_executor/model_runner.py +76 -21
  94. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  95. sglang/srt/model_loader/weight_utils.py +1 -1
  96. sglang/srt/models/bailing_moe.py +9 -2
  97. sglang/srt/models/deepseek_nextn.py +11 -2
  98. sglang/srt/models/deepseek_v2.py +149 -34
  99. sglang/srt/models/glm4.py +391 -77
  100. sglang/srt/models/glm4v.py +196 -55
  101. sglang/srt/models/glm4v_moe.py +0 -1
  102. sglang/srt/models/gpt_oss.py +1 -10
  103. sglang/srt/models/kimi_linear.py +678 -0
  104. sglang/srt/models/llama4.py +1 -1
  105. sglang/srt/models/llama_eagle3.py +11 -1
  106. sglang/srt/models/longcat_flash.py +2 -2
  107. sglang/srt/models/minimax_m2.py +1 -1
  108. sglang/srt/models/qwen2.py +1 -1
  109. sglang/srt/models/qwen2_moe.py +30 -15
  110. sglang/srt/models/qwen3.py +1 -1
  111. sglang/srt/models/qwen3_moe.py +16 -8
  112. sglang/srt/models/qwen3_next.py +7 -0
  113. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  114. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  115. sglang/srt/multiplex/pdmux_context.py +164 -0
  116. sglang/srt/parser/conversation.py +7 -1
  117. sglang/srt/sampling/custom_logit_processor.py +67 -1
  118. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  119. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  120. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  121. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  122. sglang/srt/server_args.py +103 -22
  123. sglang/srt/single_batch_overlap.py +4 -1
  124. sglang/srt/speculative/draft_utils.py +16 -0
  125. sglang/srt/speculative/eagle_info.py +42 -36
  126. sglang/srt/speculative/eagle_info_v2.py +68 -25
  127. sglang/srt/speculative/eagle_utils.py +261 -16
  128. sglang/srt/speculative/eagle_worker.py +11 -3
  129. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  130. sglang/srt/speculative/spec_info.py +305 -31
  131. sglang/srt/speculative/spec_utils.py +44 -8
  132. sglang/srt/tracing/trace.py +121 -12
  133. sglang/srt/utils/common.py +55 -32
  134. sglang/srt/utils/hf_transformers_utils.py +38 -16
  135. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  136. sglang/test/kits/radix_cache_server_kit.py +50 -0
  137. sglang/test/runners.py +31 -7
  138. sglang/test/simple_eval_common.py +5 -3
  139. sglang/test/simple_eval_humaneval.py +1 -0
  140. sglang/test/simple_eval_math.py +1 -0
  141. sglang/test/simple_eval_mmlu.py +1 -0
  142. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  143. sglang/test/test_utils.py +7 -1
  144. sglang/version.py +1 -1
  145. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +10 -24
  146. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +150 -136
  147. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  148. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  149. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
--- sglang/srt/layers/quantization/modelopt_quant.py
+++ sglang/srt/layers/quantization/modelopt_quant.py
@@ -25,10 +25,11 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
+from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
     cutlass_fp8_supported,
-    is_sm100_supported,
+    is_blackwell_supported,
 )
 from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
@@ -49,8 +50,10 @@ if TYPE_CHECKING:
     )
    from sglang.srt.single_batch_overlap import DownGemmOverlapArgs

-if is_cuda():
-    from sgl_kernel import scaled_fp4_quant
+try:
+    from flashinfer import fp4_quantize
+except ImportError:
+    fp4_quantize = None

 try:
     from flashinfer import mm_fp4 as fp4_gemm
@@ -466,8 +469,6 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
         # Fp8 moe kernel needs single weight scale for w13 per expert.
         # We take the max of the w1 and w3 scales then dequant and requant each expert.
         if layer.w13_weight_scale.dim() == 2:  # Shape: (num_experts, 2)
-            from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
-
             # Get the maximum scale across w1 and w3 for each expert
             max_w13_scales = layer.w13_weight_scale.max(dim=1).values

@@ -515,6 +516,84 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
             layer.w2_input_scale.max(), requires_grad=False
         )

+        # Align FP8 weights to FlashInfer per-tensor kernel layout if enabled
+        if should_use_flashinfer_trtllm_moe():
+            from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_a
+
+            # 1) Swap W13 halves: [Up, Gate] -> [Gate, Up] expected by FI
+            num_experts, two_n, hidden = layer.w13_weight.shape
+            inter = two_n // 2
+            w13_swapped = (
+                layer.w13_weight.reshape(num_experts, 2, inter, hidden)
+                .flip(dims=[1])
+                .reshape(num_experts, two_n, hidden)
+            )
+
+            # 2) Reorder rows for fused gated activation (W13)
+            w13_interleaved = [
+                reorder_rows_for_gated_act_gemm(w13_swapped[i])
+                for i in range(num_experts)
+            ]
+            w13_interleaved = torch.stack(w13_interleaved).reshape(
+                num_experts, two_n, hidden
+            )
+
+            # 3) Shuffle weights for transposed MMA output (both W13, W2)
+            epilogue_tile_m = 128
+            w13_shuffled = [
+                shuffle_matrix_a(w13_interleaved[i].view(torch.uint8), epilogue_tile_m)
+                for i in range(num_experts)
+            ]
+            w2_shuffled = [
+                shuffle_matrix_a(layer.w2_weight[i].view(torch.uint8), epilogue_tile_m)
+                for i in range(num_experts)
+            ]
+
+            layer.w13_weight = Parameter(
+                torch.stack(w13_shuffled).view(torch.float8_e4m3fn),
+                requires_grad=False,
+            )
+            layer.w2_weight = Parameter(
+                torch.stack(w2_shuffled).view(torch.float8_e4m3fn),
+                requires_grad=False,
+            )
+
+        # Precompute and register per-expert output scaling factors for FI MoE
+        if should_use_flashinfer_trtllm_moe():
+            # Note: w13_input_scale and w2_input_scale are scalar Parameters post-reduction
+            assert (
+                hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None
+            )
+            assert hasattr(layer, "w2_input_scale") and layer.w2_input_scale is not None
+            assert (
+                hasattr(layer, "w13_weight_scale")
+                and layer.w13_weight_scale is not None
+            )
+            assert (
+                hasattr(layer, "w2_weight_scale") and layer.w2_weight_scale is not None
+            )
+
+            input_scale = layer.w13_input_scale.to(torch.float32)
+            activation_scale = layer.w2_input_scale.to(torch.float32)
+            w13_weight_scale = layer.w13_weight_scale.to(torch.float32)
+            w2_weight_scale = layer.w2_weight_scale.to(torch.float32)
+
+            output1_scales_scalar = (
+                w13_weight_scale * input_scale * (1.0 / activation_scale)
+            )
+            output1_scales_gate_scalar = w13_weight_scale * input_scale
+            output2_scales_scalar = activation_scale * w2_weight_scale
+
+            layer.output1_scales_scalar = Parameter(
+                output1_scales_scalar, requires_grad=False
+            )
+            layer.output1_scales_gate_scalar = Parameter(
+                output1_scales_gate_scalar, requires_grad=False
+            )
+            layer.output2_scales_scalar = Parameter(
+                output2_scales_scalar, requires_grad=False
+            )
+
     def create_moe_runner(
         self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
     ):
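The three scalars registered above fold the whole FP8 dequant/requant chain into per-expert constants that the TRT-LLM kernel can apply as GEMM epilogues. A minimal numeric sketch of the same arithmetic (the scale values are hypothetical; the names mirror the parameters registered in the hunk):

import torch

# Hypothetical per-tensor scales for a single expert (float32 scalars).
w13_weight_scale = torch.tensor(0.02)  # dequant scale of the fused gate/up weights
w13_input_scale = torch.tensor(0.01)  # dequant scale of the FP8 input activations
w2_input_scale = torch.tensor(0.015)  # quant scale expected at the input of GEMM2
w2_weight_scale = torch.tensor(0.03)  # dequant scale of the down-projection weights

# GEMM1's output must be dequantized by (weight_scale * input_scale), then
# requantized by 1 / w2_input_scale so it can feed GEMM2 directly in FP8.
output1_scales_scalar = w13_weight_scale * w13_input_scale * (1.0 / w2_input_scale)

# The gate branch only needs the dequant half: the activation runs in
# higher precision before requantization.
output1_scales_gate_scalar = w13_weight_scale * w13_input_scale

# GEMM2's output is dequantized by its own (input_scale * weight_scale) pair.
output2_scales_scalar = w2_input_scale * w2_weight_scale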
@@ -526,6 +605,81 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
         layer: torch.nn.Module,
         dispatch_output: StandardDispatchOutput,
     ) -> CombineInput:
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        # Fast path: TRT-LLM FP8 per-tensor MoE using BYPASSED TopK routing
+        from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+        if should_use_flashinfer_trtllm_moe() and TopKOutputChecker.format_is_bypassed(
+            topk_output
+        ):
+            router_logits = topk_output.router_logits
+            topk_config = topk_output.topk_config
+
+            # Constraints
+            assert (
+                self.moe_runner_config.activation == "silu"
+            ), "Only silu is supported for flashinfer fp8 moe"
+
+            from flashinfer import RoutingMethodType
+            from flashinfer.fused_moe import trtllm_fp8_per_tensor_scale_moe
+
+            correction_bias = (
+                None
+                if topk_config.correction_bias is None
+                else topk_config.correction_bias
+            )
+            # Pre-quantize activations to FP8 per-tensor using provided input scale
+            x_fp8, _ = scaled_fp8_quant(x, layer.w13_input_scale)
+
+            use_routing_scales_on_input = True
+            routed_scaling_factor = self.moe_runner_config.routed_scaling_factor
+
+            # Enforce Llama4 routing for ModelOpt FP8 MoE for now.
+            # TODO(brayden): support other routing methods
+            assert topk_config.top_k == 1, "ModelOpt FP8 MoE requires top_k==1"
+            assert (
+                not topk_config.num_expert_group
+            ), "ModelOpt FP8 MoE does not support expert grouping"
+            assert (
+                not topk_config.topk_group
+            ), "ModelOpt FP8 MoE does not support grouped top-k"
+            routing_method_type = RoutingMethodType.Llama4
+
+            # FlashInfer TRTLLM requires routing_logits (and bias) to be bfloat16
+            routing_logits_cast = router_logits.to(torch.bfloat16)
+            routing_bias_cast = (
+                None if correction_bias is None else correction_bias.to(torch.bfloat16)
+            )
+
+            output = trtllm_fp8_per_tensor_scale_moe(
+                routing_logits=routing_logits_cast,
+                routing_bias=routing_bias_cast,
+                hidden_states=x_fp8,
+                gemm1_weights=layer.w13_weight,
+                output1_scales_scalar=layer.output1_scales_scalar,
+                output1_scales_gate_scalar=layer.output1_scales_gate_scalar,
+                gemm2_weights=layer.w2_weight,
+                output2_scales_scalar=layer.output2_scales_scalar,
+                num_experts=layer.num_experts,
+                top_k=topk_config.top_k,
+                n_group=0,
+                topk_group=0,
+                intermediate_size=layer.w2_weight.shape[2],
+                local_expert_offset=layer.moe_ep_rank * layer.num_local_experts,
+                local_num_experts=layer.num_local_experts,
+                routed_scaling_factor=(
+                    routed_scaling_factor if routed_scaling_factor is not None else 1.0
+                ),
+                use_routing_scales_on_input=use_routing_scales_on_input,
+                tile_tokens_dim=8,  # TODO(brayden): use the FI tile calculation
+                routing_method_type=routing_method_type,
+            )
+
+            from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+            return StandardCombineInput(hidden_states=output)

         quant_info = TritonMoeQuantInfo(
             w13_weight=layer.w13_weight,
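The fast path above hands pre-quantized FP8 activations to `trtllm_fp8_per_tensor_scale_moe`. As a rough functional sketch of what the per-tensor `scaled_fp8_quant` step computes (this is not the sgl_kernel implementation, just the math it is expected to follow):

import torch

def scaled_fp8_quant_sketch(x: torch.Tensor, scale: torch.Tensor):
    # Divide by the per-tensor scale and saturate to the e4m3 range,
    # mirroring the (quantized_tensor, scale) pair returned by scaled_fp8_quant.
    finfo = torch.finfo(torch.float8_e4m3fn)
    x_q = (x.float() / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return x_q, scale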
@@ -867,10 +1021,9 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
         output_shape = [x_m, w_n]

         # Quantize BF16 or FP16 to (FP4 and interleaved block scale)
-        x_fp4, x_scale_interleaved = scaled_fp4_quant(x, layer.input_scale_inv)
+        x_fp4, x_scale_interleaved = fp4_quantize(x, layer.input_scale_inv)

         assert x_fp4.dtype == torch.uint8
-        assert x_scale_interleaved.dtype == torch.float8_e4m3fn
         assert layer.weight.dtype == torch.uint8
         assert layer.weight_scale_interleaved.dtype == torch.float8_e4m3fn
         assert layer.alpha.dtype == torch.float32
@@ -903,7 +1056,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):

     def __init__(self, quant_config: ModelOptFp4Config):
         self.quant_config = quant_config
-        if not is_sm100_supported():
+        if not is_blackwell_supported():
             raise ValueError(
                 "Current platform does not support NVFP4"
                 " quantization. Please use Blackwell and"
@@ -1383,8 +1536,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         alt_stream=None,
     ) -> CombineInput:

-        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
-
         x = dispatch_output.hidden_states
         topk_output = dispatch_output.topk_output
@@ -1397,6 +1548,8 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         # Check if this is a FlashInferFP4MoE layer that should handle its own forward
         if hasattr(layer, "gemm1_weights_fp4_shuffled"):
             # This layer was processed with flashinfer TRTLLM - delegate to its own forward
+            from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
             return StandardCombineInput(hidden_states=layer.forward(x, topk_output))

         if self.enable_flashinfer_cutlass_moe:
@@ -1410,7 +1563,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
             output_dtype = x.dtype
             x_sf = None
             if should_use_flashinfer_cutlass_moe_fp4_allgather():
-                from flashinfer import fp4_quantize, nvfp4_block_scale_interleave
+                from flashinfer import nvfp4_block_scale_interleave

                 # Quantize before comm, swizzle after.
                 if x.shape[0] > 0:
@@ -1465,6 +1618,8 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
             if forward_shared_experts is not None:
                 torch.cuda.current_stream().wait_stream(alt_stream)

+            from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
             return StandardCombineInput(hidden_states=output)

         from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
@@ -1486,6 +1641,8 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
             apply_router_weight_on_input=moe_runner_config.apply_router_weight_on_input,
         ).to(x.dtype)
         # Scale by routed_scaling_factor is fused into select_experts.
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
         return StandardCombineInput(hidden_states=output)

     def apply_without_routing_weights(
--- sglang/srt/layers/rotary_embedding.py
+++ sglang/srt/layers/rotary_embedding.py
@@ -125,8 +125,13 @@ class RotaryEmbedding(CustomOp):
         self.cos_sin_cache: torch.Tensor
         self.register_buffer("cos_sin_cache", cache, persistent=False)

+        self._apply_rotary_emb_wrapped = _apply_rotary_emb
+
         if get_global_server_args().rl_on_policy_target == "fsdp":
             self._forward_method = self.forward_native
+            self._apply_rotary_emb_wrapped = torch.compile(dynamic=True)(
+                self._apply_rotary_emb_wrapped
+            )

     def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
         """Compute the inverse frequency."""
@@ -185,14 +190,16 @@ class RotaryEmbedding(CustomOp):
         query = query.view(num_tokens, -1, self.head_size)
         query_rot = query[..., : self.rotary_dim]
         query_pass = query[..., self.rotary_dim :]
-        query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
+        query_rot = self._apply_rotary_emb_wrapped(
+            query_rot, cos, sin, self.is_neox_style
+        )
         query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)

         key_shape = key.shape
         key = key.view(num_tokens, -1, self.head_size)
         key_rot = key[..., : self.rotary_dim]
         key_pass = key[..., self.rotary_dim :]
-        key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
+        key_rot = self._apply_rotary_emb_wrapped(key_rot, cos, sin, self.is_neox_style)
         key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
         return query, key

@@ -312,10 +319,20 @@ class RotaryEmbedding(CustomOp):
         query: torch.Tensor,
         key: torch.Tensor,
         offsets: Optional[torch.Tensor] = None,
+        fused_set_kv_buffer_arg: Optional[FusedSetKVBufferArg] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        # TODO: make a wrapper, and XPU will implement this kernel later.
-        self.cos_sin_cache = self.cos_sin_cache.to(query.device)
-        return self.forward_native(positions, query, key, offsets)
+        assert (
+            fused_set_kv_buffer_arg is None
+        ), "fused_set_kv_buffer_arg is not supported for xpu implementation"
+        positions = torch.add(positions, offsets) if offsets is not None else positions
+        return torch.ops.sgl_kernel.rotary_embedding(
+            positions,
+            query,
+            key,
+            self.head_size,
+            self.cos_sin_cache,
+            self.is_neox_style,
+        )


 class LinearScalingRotaryEmbedding(RotaryEmbedding):
@@ -1070,6 +1087,7 @@ def _triton_mrope_forward(
     mrope_section_h: tl.constexpr,
     mrope_section_w: tl.constexpr,
     is_interleaved: tl.constexpr,
+    is_neox_style: tl.constexpr,
 ):
     # Adapted from
     # https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/qwen2vl_mrope.py
@@ -1124,51 +1142,99 @@ def _triton_mrope_forward(
     # program instance (i.e. for the current token) separately
     # ####################################################################
     # left half of the head
-    first_half_q_offsets = (
-        tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
-    )
-    first_half_k_offsets = (
-        tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
-    )
-    first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (
-        tl.arange(0, pad_hd // 2)[None, :] < rd // 2
-    )
-    first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (
-        tl.arange(0, pad_hd // 2)[None, :] < rd // 2
-    )
+    if is_neox_style:
+        first_half_q_offsets = (
+            tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+        )
+        first_half_k_offsets = (
+            tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+        )
+        first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (
+            tl.arange(0, pad_hd // 2)[None, :] < rd // 2
+        )
+        first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (
+            tl.arange(0, pad_hd // 2)[None, :] < rd // 2
+        )

-    q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(
-        sin_row.dtype
-    )
-    k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(
-        sin_row.dtype
-    )
+        q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(
+            sin_row.dtype
+        )
+        k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(
+            sin_row.dtype
+        )

-    # right half of the head
-    second_half_q_offsets = first_half_q_offsets + (rd // 2)
-    second_half_k_offsets = first_half_k_offsets + (rd // 2)
-    second_q_mask = first_q_mask
-    second_k_mask = first_k_mask
+        # right half of the head
+        second_half_q_offsets = first_half_q_offsets + (rd // 2)
+        second_half_k_offsets = first_half_k_offsets + (rd // 2)
+        second_q_mask = first_q_mask
+        second_k_mask = first_k_mask
+
+        q_tile_2 = tl.load(
+            q_ptr + second_half_q_offsets, mask=second_q_mask, other=0
+        ).to(sin_row.dtype)
+        k_tile_2 = tl.load(
+            k_ptr + second_half_k_offsets, mask=second_k_mask, other=0
+        ).to(sin_row.dtype)
+
+        # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin]
+        # Since cos and sin are now half-size,
+        # we use the same cos_row and sin_row for both halves
+        new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row
+        tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask)
+        new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row
+        tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask)
+
+        new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row
+        tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask)
+        new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row
+        tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)
+    else:
+        base_q = tl.arange(0, pad_n_qh)[:, None] * hd
+        base_k = tl.arange(0, pad_n_kh)[:, None] * hd
+        even_idx = 2 * tl.arange(0, pad_hd // 2)[None, :]
+        odd_idx = even_idx + 1
+
+        even_q_offsets = base_q + even_idx
+        odd_q_offsets = base_q + odd_idx
+        even_k_offsets = base_k + even_idx
+        odd_k_offsets = base_k + odd_idx
+
+        idx_mask = tl.arange(0, pad_hd // 2)[None, :] < (rd // 2)
+        qn_mask = tl.arange(0, pad_n_qh)[:, None] < n_qh
+        kn_mask = tl.arange(0, pad_n_kh)[:, None] < n_kh
+
+        even_q_mask = qn_mask & idx_mask
+        odd_q_mask = qn_mask & idx_mask
+        even_k_mask = kn_mask & idx_mask
+        odd_k_mask = kn_mask & idx_mask
+
+        q_tile_1 = tl.load(q_ptr + even_q_offsets, mask=even_q_mask, other=0).to(
+            sin_row.dtype
+        )
+        k_tile_1 = tl.load(k_ptr + even_k_offsets, mask=even_k_mask, other=0).to(
+            sin_row.dtype
+        )

-    q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(
-        sin_row.dtype
-    )
-    k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(
-        sin_row.dtype
-    )
+        q_tile_2 = tl.load(q_ptr + odd_q_offsets, mask=odd_q_mask, other=0).to(
+            sin_row.dtype
+        )
+        k_tile_2 = tl.load(k_ptr + odd_k_offsets, mask=odd_k_mask, other=0).to(
+            sin_row.dtype
+        )

-    # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin]
-    # Since cos and sin are now half-size,
-    # we use the same cos_row and sin_row for both halves
-    new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row
-    tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask)
-    new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row
-    tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask)
+        # y = [x_even, x_odd] * [cos, cos] + [-x_odd, x_even] * [sin, sin]
+        # Interleaved (non-NeoX) rotary embedding:
+        # each (even, odd) channel pair forms one rotation arm.
+        # cos_row and sin_row each have length rd//2, shared across all (even, odd) pairs.
+        new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row
+        tl.store(q_ptr + even_q_offsets, new_q_tile_1, mask=even_q_mask)
+        new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row
+        tl.store(q_ptr + odd_q_offsets, new_q_tile_2, mask=odd_q_mask)

-    new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row
-    tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask)
-    new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row
-    tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)
+        new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row
+        tl.store(k_ptr + even_k_offsets, new_k_tile_1, mask=even_k_mask)
+        new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row
+        tl.store(k_ptr + odd_k_offsets, new_k_tile_2, mask=odd_k_mask)


 def triton_mrope(
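The new `else` branch above implements the interleaved (non-NeoX) channel layout. A plain-PyTorch sketch of the same pairing, under the same shape assumptions as the NeoX sketch earlier:

import torch

def apply_rotary_emb_interleaved_sketch(x, cos, sin):
    # Channels (0,1), (2,3), ... form the rotation pairs, matching the
    # even/odd offsets used by the Triton kernel's non-NeoX path.
    x1, x2 = x[..., 0::2], x[..., 1::2]
    cos = cos.unsqueeze(-2)  # broadcast over the heads dimension
    sin = sin.unsqueeze(-2)
    out = torch.empty_like(x)
    out[..., 0::2] = x1 * cos - x2 * sin
    out[..., 1::2] = x2 * cos + x1 * sin
    return out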
@@ -1180,6 +1246,7 @@ def triton_mrope(
     head_size: int,
     rotary_dim: int,
     mrope_interleaved: bool,
+    is_neox_style: bool,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """The mrope triton kernel.

@@ -1230,6 +1297,7 @@ def triton_mrope(
         mrope_section[1],
         mrope_section[2],
         mrope_interleaved,
+        is_neox_style,
     )
     return q, k

@@ -1373,6 +1441,7 @@ class MRotaryEmbedding(RotaryEmbedding):
         else:
             return self._forward_native(positions, query, key)

+    @torch.compile(dynamic=True, backend=get_compiler_backend())
     def _forward_triton(
         self,
         positions: torch.Tensor,
@@ -1391,6 +1460,7 @@ class MRotaryEmbedding(RotaryEmbedding):
         if positions.ndim == 2:
             assert self.mrope_section

+        torch._dynamo.graph_break()
         q, k = triton_mrope(
             query,
             key,
@@ -1400,7 +1470,9 @@ class MRotaryEmbedding(RotaryEmbedding):
             self.head_size,
             self.rotary_dim,
             self.mrope_interleaved,
+            self.is_neox_style,
         )
+        torch._dynamo.graph_break()

         return q.reshape(query_shape), k.reshape(key_shape)

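With `_forward_triton` now compiled, the explicit `torch._dynamo.graph_break()` calls fence the hand-written Triton launch out of the traced graph: the surrounding tensor ops compile, while `triton_mrope` runs eagerly between the two compiled subgraphs. A minimal sketch of that pattern (illustrative only, with the default backend; `eager_only_kernel` is a stand-in for the custom launch):

import torch

def eager_only_kernel(t: torch.Tensor) -> torch.Tensor:
    # Stand-in for a hand-written kernel that Dynamo should not trace.
    return t.sqrt()

@torch.compile(dynamic=True)
def compiled_with_eager_island(x: torch.Tensor) -> torch.Tensor:
    y = x * 2 + 1  # traced and compiled
    torch._dynamo.graph_break()  # end the first compiled graph
    y = eager_only_kernel(y)  # executes eagerly between the graphs
    torch._dynamo.graph_break()  # start a fresh graph for the tail ops
    return y - 1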
--- sglang/srt/lora/lora_registry.py
+++ sglang/srt/lora/lora_registry.py
@@ -205,3 +205,12 @@ class LoRARegistry:
         Returns the total number of LoRA adapters currently registered.
         """
         return len(self._registry)
+
+    def get_all_adapters(self) -> Dict[str, LoRARef]:
+        """
+        Returns a dictionary of all registered LoRA adapters.
+
+        Returns:
+            Dict[str, LoRARef]: A dictionary mapping LoRA names to LoRARef objects.
+        """
+        return dict(self._registry)
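A possible call site for the new accessor (hypothetical; `registry` stands for an already-populated `LoRARegistry`). Note that `dict(self._registry)` hands back a shallow copy, so iterating callers cannot mutate the registry's internal state:

# Hypothetical usage: snapshot all registered adapters for reporting.
adapters = registry.get_all_adapters()
for name, lora_ref in adapters.items():
    print(f"{name}: {lora_ref}")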
--- /dev/null
+++ sglang/srt/managers/async_mm_data_processor.py
@@ -0,0 +1,122 @@
+import asyncio
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from typing import Any, Dict, List, Optional, Union
+
+logger = logging.getLogger(__name__)
+
+
+class AsyncMMDataProcessor:
+    """
+    Async wrapper for a multimodal processor.
+
+    Behavior:
+    - If the underlying processor exposes `process_mm_data_async`, call/await it directly.
+    - Otherwise, fall back to running a synchronous `process_mm_data` in a thread pool.
+    - Optionally guard per-call concurrency via an asyncio.Semaphore.
+    - Optionally enforce per-call timeout via asyncio.wait_for.
+    """
+
+    def __init__(
+        self,
+        mm_processor: Any,
+        *,
+        max_concurrent_calls: Optional[int] = None,
+        timeout_s: Optional[float] = None,
+    ) -> None:
+        """
+        Args:
+            mm_processor: An object exposing either
+                - async def process_mm_data_async(...): -> Dict[str, Any]
+                or
+                - def process_mm_data(...): -> Dict[str, Any]
+            max_concurrent_calls: Optional concurrency cap for per-call execution.
+            timeout_s: Optional timeout (seconds) for each `process()` call.
+        """
+        self.mm_processor = mm_processor
+        self.timeout_s = timeout_s
+
+        # Concurrency guard (None -> unlimited)
+        self.semaphore = (
+            asyncio.Semaphore(max_concurrent_calls) if max_concurrent_calls else None
+        )
+
+        # Detect async path; if missing, prepare a fallback executor for sync path
+        self._proc_async = getattr(mm_processor, "process_mm_data_async", None)
+        self.is_async = asyncio.iscoroutinefunction(self._proc_async)
+        self.fallback_exec: Optional[ThreadPoolExecutor] = (
+            ThreadPoolExecutor(max_workers=max_concurrent_calls)
+            if not self.is_async
+            else None
+        )
+
+    async def process(
+        self,
+        *,
+        image_data: Optional[List[Union[str, bytes]]] = None,
+        audio_data: Optional[List[Union[str, bytes]]] = None,
+        input_text_or_ids: Union[str, List[int], None] = None,
+        request_obj: Any,
+        **kwargs: Any,
+    ) -> Dict[str, Any]:
+        """
+        Public entrypoint: process a single multimodal request without blocking the event loop.
+        """
+
+        async def _invoke() -> Dict[str, Any]:
+            if self.is_async:
+                # Native async implementation
+                return await self._proc_async(
+                    image_data=image_data,
+                    audio_data=audio_data,
+                    input_text=input_text_or_ids,
+                    request_obj=request_obj,
+                    **kwargs,
+                )
+
+            # Synchronous fallback
+            sync_fn = getattr(self.mm_processor, "process_mm_data", None)
+            if not callable(sync_fn):
+                raise RuntimeError(
+                    "mm_processor has neither 'process_mm_data_async' nor 'process_mm_data'."
+                )
+            loop = asyncio.get_running_loop()
+            fn = partial(
+                sync_fn,
+                image_data=image_data,
+                audio_data=audio_data,
+                input_text=input_text_or_ids,
+                request_obj=request_obj,
+                **kwargs,
+            )
+            return await loop.run_in_executor(self.fallback_exec, fn)
+
+        # Apply optional concurrency guard
+        if self.semaphore is not None:
+            async with self.semaphore:
+                if self.timeout_s is not None:
+                    return await asyncio.wait_for(_invoke(), timeout=self.timeout_s)
+                return await _invoke()
+
+        # No concurrency guard
+        if self.timeout_s is not None:
+            return await asyncio.wait_for(_invoke(), timeout=self.timeout_s)
+        return await _invoke()
+
+    def shutdown(self) -> None:
+        """Gracefully shutdown resources owned by this wrapper."""
+        try:
+            if self.fallback_exec:
+                self.fallback_exec.shutdown(wait=False)
+        except Exception:
+            logger.exception(
+                "Error while shutting down fallback executor in AsyncMMDataProcessor"
+            )
+
+    def __del__(self):
+        # Best-effort shutdown
+        try:
+            self.shutdown()
+        except Exception:
+            pass
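A minimal usage sketch of the new wrapper, assuming a synchronous processor so the thread-pool fallback path is exercised (`DummyProcessor` and the argument values are illustrative, not part of the package):

import asyncio

class DummyProcessor:
    # Only the sync entrypoint is defined, so AsyncMMDataProcessor
    # will run it in its ThreadPoolExecutor fallback.
    def process_mm_data(self, *, image_data, audio_data, input_text, request_obj, **kwargs):
        return {"text": input_text, "num_images": len(image_data or [])}

async def main():
    proc = AsyncMMDataProcessor(DummyProcessor(), max_concurrent_calls=4, timeout_s=30.0)
    try:
        out = await proc.process(
            image_data=[b"<raw image bytes>"],
            input_text_or_ids="describe the image",
            request_obj=None,
        )
        print(out)  # {'text': 'describe the image', 'num_images': 1}
    finally:
        proc.shutdown()

asyncio.run(main())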