sglang 0.5.2rc1__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (265)
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/lang/interpreter.py +1 -1
  4. sglang/srt/configs/__init__.py +4 -0
  5. sglang/srt/configs/device_config.py +3 -1
  6. sglang/srt/configs/dots_vlm.py +139 -0
  7. sglang/srt/configs/internvl.py +6 -0
  8. sglang/srt/configs/load_config.py +1 -0
  9. sglang/srt/configs/model_config.py +50 -6
  10. sglang/srt/configs/qwen3_next.py +326 -0
  11. sglang/srt/connector/__init__.py +8 -1
  12. sglang/srt/connector/remote_instance.py +82 -0
  13. sglang/srt/constrained/base_grammar_backend.py +48 -12
  14. sglang/srt/constrained/llguidance_backend.py +0 -1
  15. sglang/srt/constrained/outlines_backend.py +0 -1
  16. sglang/srt/constrained/xgrammar_backend.py +28 -9
  17. sglang/srt/custom_op.py +11 -1
  18. sglang/srt/debug_utils/dump_comparator.py +81 -44
  19. sglang/srt/debug_utils/dump_loader.py +97 -0
  20. sglang/srt/debug_utils/dumper.py +11 -3
  21. sglang/srt/debug_utils/text_comparator.py +73 -11
  22. sglang/srt/disaggregation/base/conn.py +1 -1
  23. sglang/srt/disaggregation/common/conn.py +15 -12
  24. sglang/srt/disaggregation/decode.py +21 -10
  25. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  26. sglang/srt/disaggregation/fake/conn.py +1 -1
  27. sglang/srt/disaggregation/mini_lb.py +6 -445
  28. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  29. sglang/srt/disaggregation/nixl/conn.py +180 -16
  30. sglang/srt/disaggregation/prefill.py +5 -3
  31. sglang/srt/disaggregation/utils.py +5 -50
  32. sglang/srt/distributed/parallel_state.py +67 -43
  33. sglang/srt/entrypoints/engine.py +38 -17
  34. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  35. sglang/srt/entrypoints/grpc_server.py +680 -0
  36. sglang/srt/entrypoints/http_server.py +88 -53
  37. sglang/srt/entrypoints/openai/protocol.py +7 -4
  38. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  39. sglang/srt/entrypoints/openai/serving_chat.py +39 -19
  40. sglang/srt/entrypoints/openai/serving_completions.py +15 -4
  41. sglang/srt/entrypoints/openai/serving_embedding.py +9 -4
  42. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  43. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  44. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  45. sglang/srt/eplb/eplb_manager.py +2 -2
  46. sglang/srt/eplb/expert_distribution.py +26 -13
  47. sglang/srt/eplb/expert_location.py +8 -3
  48. sglang/srt/eplb/expert_location_updater.py +1 -1
  49. sglang/srt/function_call/base_format_detector.py +3 -6
  50. sglang/srt/function_call/ebnf_composer.py +11 -9
  51. sglang/srt/function_call/function_call_parser.py +6 -0
  52. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  53. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  54. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  55. sglang/srt/grpc/__init__.py +1 -0
  56. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  57. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  58. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  59. sglang/srt/hf_transformers_utils.py +4 -0
  60. sglang/srt/layers/activation.py +142 -9
  61. sglang/srt/layers/attention/aiter_backend.py +93 -68
  62. sglang/srt/layers/attention/ascend_backend.py +11 -4
  63. sglang/srt/layers/attention/fla/chunk.py +242 -0
  64. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  65. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  66. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  67. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  68. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  69. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  70. sglang/srt/layers/attention/fla/index.py +37 -0
  71. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  72. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  73. sglang/srt/layers/attention/fla/op.py +66 -0
  74. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  75. sglang/srt/layers/attention/fla/utils.py +331 -0
  76. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  77. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  78. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  79. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  80. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  81. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  82. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  83. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  84. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  85. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  86. sglang/srt/layers/attention/triton_backend.py +18 -1
  87. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  88. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  89. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  90. sglang/srt/layers/communicator.py +45 -7
  91. sglang/srt/layers/dp_attention.py +30 -1
  92. sglang/srt/layers/layernorm.py +32 -15
  93. sglang/srt/layers/linear.py +34 -3
  94. sglang/srt/layers/logits_processor.py +29 -10
  95. sglang/srt/layers/moe/__init__.py +2 -1
  96. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  97. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  98. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  99. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  100. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  107. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  108. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  109. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  110. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  111. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  112. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  113. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  114. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  115. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  116. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  117. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  118. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  119. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  120. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  121. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  122. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  123. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  124. sglang/srt/layers/moe/topk.py +30 -9
  125. sglang/srt/layers/moe/utils.py +12 -7
  126. sglang/srt/layers/quantization/awq.py +19 -7
  127. sglang/srt/layers/quantization/base_config.py +11 -6
  128. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  129. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  130. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  131. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  132. sglang/srt/layers/quantization/fp8.py +76 -47
  133. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  134. sglang/srt/layers/quantization/gptq.py +25 -17
  135. sglang/srt/layers/quantization/modelopt_quant.py +182 -49
  136. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  137. sglang/srt/layers/quantization/mxfp4.py +68 -41
  138. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  139. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  140. sglang/srt/layers/quantization/quark/utils.py +97 -0
  141. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  142. sglang/srt/layers/quantization/unquant.py +135 -47
  143. sglang/srt/layers/quantization/w4afp8.py +30 -17
  144. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  145. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  146. sglang/srt/layers/rocm_linear_utils.py +44 -0
  147. sglang/srt/layers/rotary_embedding.py +0 -18
  148. sglang/srt/layers/sampler.py +162 -18
  149. sglang/srt/lora/backend/base_backend.py +50 -8
  150. sglang/srt/lora/backend/triton_backend.py +90 -2
  151. sglang/srt/lora/layers.py +32 -0
  152. sglang/srt/lora/lora.py +4 -1
  153. sglang/srt/lora/lora_manager.py +35 -112
  154. sglang/srt/lora/mem_pool.py +24 -10
  155. sglang/srt/lora/utils.py +18 -9
  156. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  157. sglang/srt/managers/cache_controller.py +200 -199
  158. sglang/srt/managers/data_parallel_controller.py +105 -35
  159. sglang/srt/managers/detokenizer_manager.py +8 -4
  160. sglang/srt/managers/disagg_service.py +46 -0
  161. sglang/srt/managers/io_struct.py +199 -12
  162. sglang/srt/managers/mm_utils.py +1 -0
  163. sglang/srt/managers/multi_tokenizer_mixin.py +351 -397
  164. sglang/srt/managers/schedule_batch.py +77 -56
  165. sglang/srt/managers/schedule_policy.py +4 -3
  166. sglang/srt/managers/scheduler.py +191 -139
  167. sglang/srt/managers/scheduler_metrics_mixin.py +116 -9
  168. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  169. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  170. sglang/srt/managers/template_manager.py +3 -3
  171. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  172. sglang/srt/managers/tokenizer_manager.py +260 -519
  173. sglang/srt/managers/tp_worker.py +53 -4
  174. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  175. sglang/srt/mem_cache/allocator.py +1 -1
  176. sglang/srt/mem_cache/hicache_storage.py +18 -33
  177. sglang/srt/mem_cache/hiradix_cache.py +108 -48
  178. sglang/srt/mem_cache/memory_pool.py +347 -48
  179. sglang/srt/mem_cache/memory_pool_host.py +121 -57
  180. sglang/srt/mem_cache/radix_cache.py +0 -2
  181. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  182. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  183. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +95 -5
  184. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  185. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  186. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +81 -20
  187. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  188. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  189. sglang/srt/metrics/collector.py +502 -77
  190. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  191. sglang/srt/metrics/utils.py +48 -0
  192. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  193. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  194. sglang/srt/model_executor/forward_batch_info.py +75 -19
  195. sglang/srt/model_executor/model_runner.py +357 -30
  196. sglang/srt/model_loader/__init__.py +9 -3
  197. sglang/srt/model_loader/loader.py +128 -4
  198. sglang/srt/model_loader/weight_utils.py +2 -1
  199. sglang/srt/models/apertus.py +686 -0
  200. sglang/srt/models/bailing_moe.py +798 -218
  201. sglang/srt/models/bailing_moe_nextn.py +168 -0
  202. sglang/srt/models/deepseek_v2.py +346 -48
  203. sglang/srt/models/dots_vlm.py +174 -0
  204. sglang/srt/models/dots_vlm_vit.py +337 -0
  205. sglang/srt/models/ernie4.py +1 -1
  206. sglang/srt/models/gemma3n_mm.py +1 -1
  207. sglang/srt/models/glm4_moe.py +11 -2
  208. sglang/srt/models/glm4v.py +4 -2
  209. sglang/srt/models/glm4v_moe.py +3 -0
  210. sglang/srt/models/gpt_oss.py +1 -1
  211. sglang/srt/models/internvl.py +28 -0
  212. sglang/srt/models/llama4.py +9 -0
  213. sglang/srt/models/llama_eagle3.py +13 -0
  214. sglang/srt/models/longcat_flash.py +2 -2
  215. sglang/srt/models/minicpmv.py +165 -3
  216. sglang/srt/models/mllama4.py +25 -0
  217. sglang/srt/models/opt.py +637 -0
  218. sglang/srt/models/qwen2.py +7 -0
  219. sglang/srt/models/qwen2_5_vl.py +27 -3
  220. sglang/srt/models/qwen2_moe.py +60 -13
  221. sglang/srt/models/qwen3.py +8 -2
  222. sglang/srt/models/qwen3_moe.py +40 -9
  223. sglang/srt/models/qwen3_next.py +1042 -0
  224. sglang/srt/models/qwen3_next_mtp.py +112 -0
  225. sglang/srt/models/step3_vl.py +1 -1
  226. sglang/srt/models/torch_native_llama.py +1 -1
  227. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  228. sglang/srt/multimodal/processors/glm4v.py +9 -9
  229. sglang/srt/multimodal/processors/internvl.py +141 -129
  230. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  231. sglang/srt/offloader.py +27 -3
  232. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  233. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  234. sglang/srt/sampling/sampling_batch_info.py +18 -15
  235. sglang/srt/server_args.py +355 -37
  236. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  237. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  238. sglang/srt/speculative/eagle_utils.py +0 -2
  239. sglang/srt/speculative/eagle_worker.py +197 -112
  240. sglang/srt/speculative/spec_info.py +5 -0
  241. sglang/srt/speculative/standalone_worker.py +109 -0
  242. sglang/srt/tracing/trace.py +552 -0
  243. sglang/srt/utils.py +46 -3
  244. sglang/srt/weight_sync/utils.py +1 -1
  245. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  246. sglang/test/few_shot_gsm8k.py +1 -0
  247. sglang/test/runners.py +4 -0
  248. sglang/test/test_cutlass_moe.py +24 -6
  249. sglang/test/test_disaggregation_utils.py +66 -0
  250. sglang/test/test_fp4_moe.py +370 -1
  251. sglang/test/test_utils.py +28 -1
  252. sglang/utils.py +12 -0
  253. sglang/version.py +1 -1
  254. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  255. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +263 -200
  256. sglang/srt/disaggregation/launch_lb.py +0 -118
  257. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  258. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  259. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  260. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  261. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  262. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  263. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  264. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  265. {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
sglang/srt/models/dots_vlm.py
@@ -0,0 +1,174 @@
+# Copyright 2025 The RedNote HiLab team.
+# Copyright 2025 The SGLang team.
+#
+# This code is based on the DeepseekVL2ForCausalLM and DotsVisionTransformer
+# implementation in this library.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Dots-VL model compatible with HuggingFace weights."""
+
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.configs.dots_vlm import DotsVLMConfig
+from sglang.srt.distributed import parallel_state
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_v2 import DeepseekV2ForCausalLM
+
+from .dots_vlm_vit import DotsVisionTransformer
+
+
+class DotsVLMForCausalLM(nn.Module):
+    """DotsVLM model for sglang inference"""
+
+    def __init__(
+        self, config: DotsVLMConfig, quant_config: Optional[QuantizationConfig] = None
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.image_token_id = config.im_span_id
+        self.video_token_id = config.video_span_id
+
+        self.language_model = DeepseekV2ForCausalLM(
+            config.language_config, quant_config
+        )
+
+        # Initialize vision tower (matching transformers naming for weight compatibility)
+        self.vision_tower = DotsVisionTransformer(config.vision_config)
+
+    def _pad_vit_attn_dummy_heads(self, name: str, loaded_weight: torch.Tensor):
+        """pad attn qkv weights for dummy heads"""
+        num_dummy_heads = self.config.vision_config.num_dummy_heads
+        if num_dummy_heads == 0:
+            return loaded_weight
+        head_dim = self.config.vision_config.head_dim
+
+        if "attn.qkv_proj" in name:
+            wq, wk, wv = loaded_weight.chunk(3, dim=0)
+            if name.endswith(".weight"):
+                dummy_shape = [num_dummy_heads, head_dim, wq.shape[-1]]
+            elif name.endswith(".bias"):
+                dummy_shape = [num_dummy_heads, head_dim]
+            else:
+                raise RuntimeError(f"Unsupported weight with name={name}")
+            pad_func = lambda x: torch.cat(
+                [x.unflatten(0, (-1, head_dim)), x.new_zeros(dummy_shape)], dim=0
+            ).flatten(0, 1)
+            wq, wk, wv = pad_func(wq), pad_func(wk), pad_func(wv)
+            loaded_weight = torch.cat([wq, wk, wv], dim=0)
+        if "attn.proj.weight" in name:
+            padded_weight = loaded_weight.new_zeros(
+                loaded_weight.shape[0], head_dim * num_dummy_heads
+            )
+            loaded_weight = torch.cat([loaded_weight, padded_weight], dim=-1)
+        if "attn.q_norm.weight" in name or "attn.k_norm.weight" in name:
+            padded_weight = loaded_weight.new_zeros(head_dim * num_dummy_heads)
+            loaded_weight = torch.cat([loaded_weight, padded_weight], dim=0)
+        return loaded_weight
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load weights for the model, separating vision and language weights"""
+        weights = list(weights)
+
+        # Separate vision tower weights and language model weights
+        vision_weights = []
+        language_weights = []
+
+        for name, loaded_weight in weights:
+            if name.startswith("vision_tower."):
+                vision_name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+                vision_weights.append((vision_name, loaded_weight))
+            else:
+                # All other weights go to language model
+                language_weights.append((name, loaded_weight))
+
+        # Load vision tower weights
+        vision_state_dict = dict(vision_weights)
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in vision_state_dict.items():
+            if name not in params_dict:
+                raise ValueError(f"Weight {name} not found in params_dict")
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            loaded_weight = self._pad_vit_attn_dummy_heads(name, loaded_weight)
+            weight_loader(param, loaded_weight)
+
+        # Load language model weights
+        if language_weights:
+            self.language_model.load_weights(language_weights)
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return DeepseekV2ForCausalLM.get_model_config_for_expert_location(config)
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        """Pad input_ids with multimodal tokens"""
+        # Get image token ID for padding pattern
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        padded_input_ids = pattern.pad_input_tokens(input_ids, mm_inputs)
+        return padded_input_ids
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # Extract pixel values and grid information (following reference pattern)
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.vision_tower.dtype
+        )
+        image_grid_thw = torch.concat(
+            [item.image_grid_thw for item in items], dim=0
+        ).to(self.vision_tower.device)
+
+        # Add dimension checks like in reference code
+        assert pixel_values.dim() == 2, f"{pixel_values.dim()=}"
+        assert image_grid_thw.dim() == 2, f"{image_grid_thw.dim()=}"
+
+        # Process through vision tower
+        image_embeds = self.vision_tower(pixel_values, image_grid_thw)
+
+        # Ensure consistent dtype for FlashInfer compatibility
+        # Force bfloat16 to match model's expected dtype
+        if image_embeds.dtype != torch.bfloat16 and hasattr(
+            self.language_model.model, "embed_tokens"
+        ):
+            target_dtype = self.language_model.model.embed_tokens.weight.dtype
+            image_embeds = image_embeds.to(target_dtype)
+
+        return image_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            multimodal_model=self,
+            language_model=self.language_model,
+        )
+        return hidden_states
+
+
+EntryClass = [DotsVLMForCausalLM]
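
Note on the new model above: the vision tower pads its attention head count up to a multiple of the tensor-parallel world size by appending zero-initialized "dummy" heads to the checkpoint weights (see `_pad_vit_attn_dummy_heads` here and `_update_vision_config` in dots_vlm_vit.py below). A minimal sketch of that arithmetic on toy tensors; the `pad_heads` helper and all shapes are illustrative assumptions, not part of the package:

```python
import torch

# Toy sizes: 14 heads of dim 8 cannot be split evenly across 4 TP ranks.
num_heads, head_dim, world_size = 14, 8, 4

# Same rounding as _update_vision_config: pad up to a multiple of world_size.
num_dummy_heads = ((num_heads + world_size) // world_size) * world_size - num_heads
assert (num_heads + num_dummy_heads) % world_size == 0  # 16 heads over 4 ranks

def pad_heads(w: torch.Tensor) -> torch.Tensor:
    # Append zero rows for the dummy heads, as pad_func does per q/k/v chunk.
    dummy = w.new_zeros(num_dummy_heads, head_dim, w.shape[-1])
    return torch.cat([w.unflatten(0, (-1, head_dim)), dummy], dim=0).flatten(0, 1)

wq = torch.randn(num_heads * head_dim, 32)  # (112, 32)
print(pad_heads(wq).shape)                  # torch.Size([128, 32]) == (14 + 2) * 8 rows
```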
sglang/srt/models/dots_vlm_vit.py
@@ -0,0 +1,337 @@
+import logging
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch.nn import LayerNorm
+from transformers.modeling_utils import PreTrainedModel
+
+from sglang.srt.configs.dots_vlm import DotsVisionConfig
+from sglang.srt.distributed import parallel_state
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class VisionRotaryEmbedding(nn.Module):
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(
+            seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
+        )
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs
+
+
+class PatchMerger(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        context_dim: int,
+        spatial_merge_size: int = 2,
+        pre_norm="layernorm",
+        init_merger_std=None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+        self.pre_norm = pre_norm
+        if self.pre_norm == "layernorm":
+            self.ln_q = LayerNorm(context_dim, eps=1e-6)
+        elif self.pre_norm == "rmsnorm":
+            self.ln_q = RMSNorm(context_dim, eps=1e-6)
+        else:
+            logger.warning(f"no norm in patch merger: {self.pre_norm}")
+
+        self.mlp = nn.Sequential(
+            nn.Linear(self.hidden_size, self.hidden_size),
+            nn.GELU(),
+            nn.Linear(self.hidden_size, dim),
+        )
+
+        if init_merger_std is not None:
+            nn.init.normal_(self.mlp[0].weight, mean=0.0, std=init_merger_std)
+            nn.init.zeros_(self.mlp[0].bias)
+            nn.init.normal_(self.mlp[2].weight, mean=0.0, std=init_merger_std)
+            nn.init.zeros_(self.mlp[2].bias)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.pre_norm:
+            x = self.mlp(self.ln_q(x).view(-1, self.hidden_size))
+        else:
+            x = self.mlp(x.view(-1, self.hidden_size))
+        return x
+
+
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+
+    def extra_repr(self) -> str:
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
+
+    def _norm(self, x: torch.Tensor) -> torch.Tensor:
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+
+class DotsSwiGLUFFN(nn.Module):
+    def __init__(self, config, quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        hidden_features = config.intermediate_size
+        in_features = config.embed_dim
+        bias = config.use_bias
+
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+        self.fc2 = nn.Linear(hidden_features, in_features, bias=bias)
+        self.fc3 = nn.Linear(in_features, hidden_features, bias=bias)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = F.silu(self.fc1(x)) * self.fc3(x)
+        x = self.fc2(x)
+        return x
+
+
+class DotsPatchEmbed(nn.Module):
+    def __init__(self, config, quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        self.num_channels = config.num_channels
+        self.patch_size = config.patch_size
+        self.temporal_patch_size = config.temporal_patch_size
+        self.embed_dim = config.embed_dim
+        self.config = config
+        self.proj = nn.Conv2d(
+            config.num_channels,
+            config.embed_dim,
+            kernel_size=(config.patch_size, config.patch_size),
+            stride=(config.patch_size, config.patch_size),
+        )
+        self.norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
+
+    def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor:
+        x = x.view(
+            -1,
+            self.num_channels,
+            self.temporal_patch_size,
+            self.patch_size,
+            self.patch_size,
+        )[:, :, 0]
+        x = self.proj(x).view(-1, self.embed_dim)
+        x = self.norm(x)
+        return x
+
+
+class DotsViTPreprocessor(nn.Module):
+    def __init__(self, config, quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        self.patch_h = config.patch_size
+        self.patch_w = config.patch_size
+        self.embed_dim = config.embed_dim
+        self.config = config
+        self.patchifier = DotsPatchEmbed(config, quant_config)
+
+    def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor:
+        tokens = self.patchifier(x, grid_thw)
+        return tokens
+
+
+class DotsVisionBlock(nn.Module):
+    def __init__(
+        self,
+        config: DotsVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        attn_implementation: str = "flash_attention_2",
+    ):
+        super().__init__()
+        if attn_implementation == "flash_attention_2":
+            qkv_backend = "fa3"
+            softmax_in_single_precision = False
+        else:
+            raise RuntimeError("Unimplemented")
+        self.attn = VisionAttention(
+            embed_dim=config.embed_dim,
+            num_heads=config.num_attention_heads,
+            projection_size=config.embed_dim,
+            use_qkv_parallel=True,
+            qkv_backend=qkv_backend,
+            softmax_in_single_precision=softmax_in_single_precision,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+            num_dummy_heads=config.num_dummy_heads,
+            qkv_bias=config.use_bias,
+            proj_bias=config.use_bias,
+        )
+        self.norm1 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
+        self.mlp = DotsSwiGLUFFN(config, quant_config)
+        self.norm2 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
+
+    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
+        hidden_states = hidden_states + self.attn(
+            self.norm1(hidden_states),
+            cu_seqlens=cu_seqlens,
+            position_embeddings=rotary_pos_emb,
+        )
+        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
+        return hidden_states
+
+
+class DotsVisionTransformer(PreTrainedModel):
+    def __init__(
+        self,
+        config: DotsVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__(config)
+        self.config = config
+        self._update_vision_config()
+        self.spatial_merge_size = config.spatial_merge_size
+
+        self.patch_embed = DotsViTPreprocessor(config, quant_config)
+        self._init_weights(self.patch_embed.patchifier.proj)
+
+        head_dim = config.embed_dim // config.num_attention_heads
+
+        self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
+
+        _num_hidden_layers = config.num_hidden_layers
+        self.blocks = nn.ModuleList(
+            [
+                DotsVisionBlock(
+                    config, quant_config, f"blocks.{i}", config.attn_implementation
+                )
+                for i in range(_num_hidden_layers)
+            ]
+        )
+
+        if self.config.post_norm:
+            self.post_trunk_norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
+
+        self.merger = PatchMerger(
+            dim=config.hidden_size,
+            context_dim=config.embed_dim,
+            spatial_merge_size=config.spatial_merge_size,
+            init_merger_std=self.config.init_merger_std,
+            quant_config=quant_config,
+        )
+
+        self.gradient_checkpointing = False
+
+    def _update_vision_config(self):
+        """update vision config to support tp"""
+        world_size = parallel_state.get_tensor_model_parallel_world_size()
+        num_heads = self.config.num_attention_heads
+        head_dim = self.config.embed_dim // num_heads
+        num_dummy_heads = 0
+
+        if num_heads % world_size != 0:
+            num_dummy_heads = (
+                (num_heads + world_size) // world_size
+            ) * world_size - num_heads
+
+        setattr(self.config, "head_dim", head_dim)
+        setattr(self.config, "num_dummy_heads", num_dummy_heads)
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.blocks[0].mlp.fc2.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.blocks[0].mlp.fc2.weight.device
+
+    def get_pos_ids_by_grid(self, grid_thw):
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+            hpos_ids = hpos_ids.flatten()
+
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+            wpos_ids = wpos_ids.flatten()
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+
+        return pos_ids
+
+    def rot_pos_emb(self, grid_thw):
+        pos_ids = self.get_pos_ids_by_grid(grid_thw)
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb
+
+    def calc_cos_sin(self, rotary_pos_emb):
+        cos = rotary_pos_emb.cos()
+        sin = rotary_pos_emb.sin()
+        cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
+        sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
+        rotary_pos_emb = (cos, sin)
+        return rotary_pos_emb
+
+    def forward(
+        self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, bf16=True
+    ) -> torch.Tensor:
+        if bf16:
+            hidden_states = hidden_states.bfloat16()
+        hidden_states = self.patch_embed(hidden_states, grid_thw)
+
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+        rotary_pos_emb = self.calc_cos_sin(rotary_pos_emb)
+
+        cu_seqlens = torch.repeat_interleave(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(
+            dim=0,
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+
+        for blk in self.blocks:
+            hidden_states = blk(
+                hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
+            )
+
+        if self.config.post_norm:
+            hidden_states = self.post_trunk_norm(hidden_states)
+
+        hidden_states = self.merger(hidden_states)
+        return hidden_states
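
For orientation, here is the `cu_seqlens` computation from `DotsVisionTransformer.forward` run standalone on a made-up `grid_thw` (the values are illustrative): each image contributes `t` sequences of `h * w` patch tokens, and the cumulative boundaries drive the varlen attention in each block.

```python
import torch
import torch.nn.functional as F

grid_thw = torch.tensor([[1, 4, 6], [2, 2, 2]])  # per-image (t, h, w), made up
cu_seqlens = torch.repeat_interleave(
    grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]  # [24, 4, 4] tokens per sequence
).cumsum(dim=0, dtype=torch.int32)
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
print(cu_seqlens)  # tensor([ 0, 24, 28, 32], dtype=torch.int32)
```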
sglang/srt/models/ernie4.py
@@ -92,7 +92,7 @@ class Ernie4Moe(nn.Module):
             correction_bias=self.gate.e_score_correction_bias,
         )
 
-        self.experts = get_moe_impl_class()(
+        self.experts = get_moe_impl_class(quant_config)(
             num_experts=config.moe_num_experts,
             top_k=config.moe_k,
             hidden_size=config.hidden_size,
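
The same `get_moe_impl_class()` → `get_moe_impl_class(quant_config)` change recurs below in glm4_moe.py and gpt_oss.py: the MoE implementation class is now resolved from the quantization config rather than from global state alone. A hypothetical sketch of the shape of such a factory; this is not sglang's actual logic (the real function lives in sglang.srt.layers.moe and also consults server args):

```python
from typing import Optional, Type

class FusedMoE:  # stand-in for the default expert implementation
    pass

class QuantAwareFusedMoE(FusedMoE):  # stand-in for a quantization-aware variant
    pass

def get_moe_impl_class(quant_config: Optional[object]) -> Type[FusedMoE]:
    # Assumed behavior: branch on the quant config instead of a global flag.
    return QuantAwareFusedMoE if quant_config is not None else FusedMoE

experts_cls = get_moe_impl_class(quant_config=None)
print(experts_cls.__name__)  # FusedMoE
```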
sglang/srt/models/gemma3n_mm.py
@@ -499,7 +499,7 @@ class Gemma3nForConditionalGeneration(PreTrainedModel):
     def should_apply_lora(self, module_name: str) -> bool:
         return bool(self.lora_pattern.match(module_name))
 
-    def get_hidden_dim(self, module_name):
+    def get_hidden_dim(self, module_name, layer_idx):
         # return input_dim, output_dim
         if module_name == "qkv_proj":
             return (
sglang/srt/models/glm4_moe.py
@@ -153,7 +153,13 @@ class Glm4MoeMLP(nn.Module):
         )
         self.act_fn = SiluAndMul()
 
-    def forward(self, x, forward_batch=None, should_allreduce_fusion=False):
+    def forward(
+        self,
+        x,
+        forward_batch=None,
+        should_allreduce_fusion=False,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ):
         if (self.tp_size == 1) and x.shape[0] == 0:
             return x
 
@@ -423,7 +429,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
             routed_scaling_factor=self.routed_scaling_factor,
         )
 
-        self.experts = get_moe_impl_class()(
+        self.experts = get_moe_impl_class(quant_config)(
             num_experts=config.n_routed_experts
             + self.num_fused_shared_experts
             + global_server_args_dict["ep_num_redundant_experts"],
@@ -501,6 +507,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
         hidden_states: torch.Tensor,
         should_allreduce_fusion: bool = False,
         use_reduce_scatter: bool = False,
+        gemm_output_zero_allocator: BumpAllocator = None,
     ) -> torch.Tensor:
 
         current_stream = torch.cuda.current_stream()
@@ -543,6 +550,7 @@ class Glm4MoeSparseMoeBlock(DeepseekV2MoE):
         hidden_states: torch.Tensor,
         should_allreduce_fusion: bool = False,
         use_reduce_scatter: bool = False,
+        gemm_output_zero_allocator: BumpAllocator = None,
    ) -> torch.Tensor:
         if hasattr(self, "shared_experts") and use_intel_amx_backend(
             self.shared_experts.gate_up_proj
@@ -666,6 +674,7 @@ class Glm4MoeDecoderLayer(DeepseekV2DecoderLayer):
         forward_batch: ForwardBatch,
         residual: Optional[torch.Tensor],
         zero_allocator: BumpAllocator,
+        gemm_output_zero_allocator: BumpAllocator = None,
     ) -> torch.Tensor:
         hidden_states, residual = self.layer_communicator.prepare_attn(
             hidden_states, residual, forward_batch
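
The glm4_moe forward paths above now thread a `gemm_output_zero_allocator: BumpAllocator = None` argument through the layers. As a rough illustration of the bump-allocation pattern behind such a parameter, here is a minimal sketch with assumed semantics; sglang's actual BumpAllocator API may differ:

```python
import torch

class BumpAllocator:
    """Hands out slices of one preallocated zeroed buffer; reset between batches."""

    def __init__(self, buffer_size: int, dtype: torch.dtype, device: str):
        self._buffer = torch.zeros(buffer_size, dtype=dtype, device=device)
        self._offset = 0

    def allocate(self, size: int) -> torch.Tensor:
        # Hand out the next `size` elements; there is no per-allocation free.
        assert self._offset + size <= self._buffer.numel(), "buffer exhausted"
        out = self._buffer[self._offset : self._offset + size]
        self._offset += size
        return out

    def reset(self) -> None:
        self._offset = 0  # reuse the whole buffer for the next forward pass

alloc = BumpAllocator(1024, dtype=torch.float32, device="cpu")
gemm_out = alloc.allocate(256)  # a zero-initialized view into the shared buffer
alloc.reset()
```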
sglang/srt/models/glm4v.py
@@ -93,9 +93,8 @@ class Glm4vVisionBlock(Qwen2_5_VisionBlock):
             quant_config=quant_config,
             prefix=prefix,
             num_dummy_heads=config.num_dummy_heads,
+            rms_norm_eps=config.rms_norm_eps,
         )
-        self.norm1 = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.norm2 = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.mlp = Glm4vVisionMLP(
             config.hidden_size,
@@ -498,6 +497,9 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
         self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
         self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
 
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
     def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
         pixel_values = torch.cat(
             [item.feature.squeeze(0) for item in items], dim=0
sglang/srt/models/glm4v_moe.py
@@ -74,6 +74,9 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
         self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
         self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
 
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
     def determine_num_fused_shared_experts(
         self, architecture: str = "Glm4MoeForCausalLM"
     ):
sglang/srt/models/gpt_oss.py
@@ -121,7 +121,7 @@ class GptOssSparseMoeBlock(nn.Module):
         )
 
         self.top_k = config.num_experts_per_tok
-        experts_type = get_moe_impl_class()
+        experts_type = get_moe_impl_class(quant_config)
         extra_kwargs = {}
         if experts_type.__name__ == "FusedMoE":
             quant_config_name = (
sglang/srt/models/internvl.py
@@ -26,8 +26,10 @@ from sglang.srt.managers.schedule_batch import (
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.deepseek_janus_pro import DropPath
+from sglang.srt.models.gpt_oss import GptOssForCausalLM
 from sglang.srt.models.internlm2 import InternLM2ForCausalLM
 from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.models.qwen3 import Qwen3ForCausalLM
 from sglang.srt.models.qwen3_moe import Qwen3MoeForCausalLM
 from sglang.utils import logger
 
@@ -445,6 +447,14 @@ class InternVLChatModel(nn.Module):
             self.language_model = Qwen3MoeForCausalLM(
                 config=config.llm_config, quant_config=quant_config
             )
+        elif config.llm_config.architectures[0] == "GptOssForCausalLM":
+            self.language_model = GptOssForCausalLM(
+                config=config.llm_config, quant_config=quant_config
+            )
+        elif config.llm_config.architectures[0] == "Qwen3ForCausalLM":
+            self.language_model = Qwen3ForCausalLM(
+                config=config.llm_config, quant_config=quant_config
+            )
         else:
             raise NotImplementedError(
                 f"{config.llm_config.architectures[0]} is not implemented."
@@ -577,6 +587,15 @@ class InternVLChatModel(nn.Module):
                 ckpt_up_proj_name="up_proj",
                 num_experts=self.config.num_experts,
             )
+        elif "Qwen3ForCausalLM" in self.config.llm_config.architectures:
+            stacked_params_mapping = [
+                # (param_name, shard_name, shard_id)
+                ("qkv_proj", "q_proj", "q"),
+                ("qkv_proj", "k_proj", "k"),
+                ("qkv_proj", "v_proj", "v"),
+                ("gate_up_proj", "gate_proj", 0),
+                ("gate_up_proj", "up_proj", 1),
+            ]
 
         params_dict = dict(self.named_parameters())
         loaded_params: Set[str] = set()
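
The `stacked_params_mapping` added above tells the weight loader which checkpoint shards fold into fused parameters (q/k/v into `qkv_proj`, gate/up into `gate_up_proj`). A toy rendition of how such a mapping is typically consumed; the real loop in `load_weights` handles many more cases:

```python
stacked_params_mapping = [
    # (param_name, shard_name, shard_id)
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
    ("gate_up_proj", "gate_proj", 0),
    ("gate_up_proj", "up_proj", 1),
]

def resolve(ckpt_name: str):
    # Rewrite a checkpoint name to its fused parameter plus a shard id, so the
    # loader can copy the tensor into the right slice of the fused weight.
    for param_name, shard_name, shard_id in stacked_params_mapping:
        if shard_name in ckpt_name:
            return ckpt_name.replace(shard_name, param_name), shard_id
    return ckpt_name, None  # not a stacked shard: load directly

print(resolve("model.layers.0.self_attn.k_proj.weight"))
# ('model.layers.0.self_attn.qkv_proj.weight', 'k')
```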
sglang/srt/models/internvl.py (continued)
@@ -661,6 +680,15 @@ class InternVLChatModel(nn.Module):
 
             loaded_params.add(name)
         unloaded_params = params_dict.keys() - loaded_params
+        # Skip params that are created by quantization wrappers and are not expected in the ckpt
+        _quant_only_fragments = (
+            "weight_scale",  # per-matrix FP8 scales (e.g., w2_weight_scale, w13_weight_scale)
+        )
+        unloaded_params = {
+            n
+            for n in unloaded_params
+            if not any(frag in n for frag in _quant_only_fragments)
+        }
         if unloaded_params:
             raise RuntimeError(
                 f"Some weights are not initialized from checkpoints: {unloaded_params}"