sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +321 -31
- sglang/bench_serving.py +10 -3
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +8 -0
- sglang/srt/configs/model_config.py +160 -105
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/constrained/base_grammar_backend.py +1 -0
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +6 -4
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/common/conn.py +266 -98
- sglang/srt/disaggregation/decode.py +50 -9
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/mooncake/conn.py +51 -541
- sglang/srt/disaggregation/nixl/conn.py +148 -39
- sglang/srt/disaggregation/prefill.py +31 -14
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +135 -80
- sglang/srt/entrypoints/engine.py +23 -3
- sglang/srt/entrypoints/grpc_request_manager.py +330 -55
- sglang/srt/entrypoints/grpc_server.py +232 -102
- sglang/srt/entrypoints/http_server.py +49 -9
- sglang/srt/entrypoints/openai/protocol.py +110 -5
- sglang/srt/entrypoints/openai/serving_base.py +25 -6
- sglang/srt/entrypoints/openai/serving_chat.py +178 -49
- sglang/srt/entrypoints/openai/serving_completions.py +5 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +42 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/function_call/function_call_parser.py +3 -2
- sglang/srt/function_call/glm4_moe_detector.py +3 -3
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
- sglang/srt/layers/activation.py +7 -6
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +108 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +112 -194
- sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
- sglang/srt/layers/attention/mamba/mamba.py +566 -1
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +42 -9
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +11 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +2 -0
- sglang/srt/layers/linear.py +21 -4
- sglang/srt/layers/logits_processor.py +15 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +147 -74
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
- sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
- sglang/srt/layers/moe/utils.py +10 -0
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/modelopt_quant.py +44 -9
- sglang/srt/layers/quantization/mxfp4.py +12 -4
- sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
- sglang/srt/layers/quantization/w4afp8.py +0 -4
- sglang/srt/layers/quantization/w8a8_int8.py +15 -3
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +52 -4
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +10 -4
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +17 -6
- sglang/srt/lora/mem_pool.py +1 -1
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +42 -142
- sglang/srt/managers/data_parallel_controller.py +11 -46
- sglang/srt/managers/detokenizer_manager.py +11 -11
- sglang/srt/managers/io_struct.py +162 -118
- sglang/srt/managers/mm_utils.py +43 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +167 -86
- sglang/srt/managers/schedule_policy.py +143 -16
- sglang/srt/managers/scheduler.py +359 -214
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
- sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
- sglang/srt/managers/tokenizer_manager.py +84 -136
- sglang/srt/managers/tp_worker.py +39 -29
- sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +40 -1
- sglang/srt/mem_cache/hiradix_cache.py +119 -32
- sglang/srt/mem_cache/memory_pool.py +188 -10
- sglang/srt/mem_cache/memory_pool_host.py +134 -182
- sglang/srt/mem_cache/radix_cache.py +222 -71
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
- sglang/srt/mem_cache/swa_radix_cache.py +25 -34
- sglang/srt/metrics/collector.py +82 -120
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +39 -32
- sglang/srt/model_executor/forward_batch_info.py +23 -38
- sglang/srt/model_executor/model_runner.py +131 -183
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/loader.py +14 -10
- sglang/srt/model_loader/weight_utils.py +156 -2
- sglang/srt/models/bailing_moe.py +27 -4
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +536 -153
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +3 -3
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +1 -1
- sglang/srt/models/glm4v_moe.py +1 -1
- sglang/srt/models/gpt_oss.py +7 -30
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/longcat_flash.py +1 -1
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +15 -4
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +2 -2
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +64 -1
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +31 -3
- sglang/srt/models/qwen3_next.py +36 -9
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +2 -3
- sglang/srt/multimodal/processors/internvl.py +20 -8
- sglang/srt/multimodal/processors/qwen_vl.py +8 -1
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +20 -2
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +753 -295
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
- sglang/srt/speculative/eagle_worker.py +57 -25
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +47 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +399 -74
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +1 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +12 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +355 -4
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,357 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/starcoder2.py
+""" PyTorch Starcoder2 model."""
+from collections.abc import Iterable
+from typing import Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import Starcoder2Config
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class Starcoder2Attention(nn.Module):
+
+    def __init__(
+        self,
+        config: Starcoder2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        layer_id: int = 0,
+    ):
+        super().__init__()
+        self.config = config
+
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = config.rope_theta
+        self.max_position_embeddings = config.max_position_embeddings
+        self.use_bias = config.use_bias
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=self.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=self.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=int(self.rope_theta),
+            is_neox_style=True,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Starcoder2MLP(nn.Module):
+
+    def __init__(
+        self,
+        config: Starcoder2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.c_fc = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.c_fc",
+        )
+        self.c_proj = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.c_proj",
+        )
+        self.act = get_act_fn(config.hidden_act)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states, _ = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.c_proj(hidden_states)
+        return hidden_states
+
+
+class Starcoder2DecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: Starcoder2Config,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Starcoder2Attention(
+            config=config,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+        self.mlp = Starcoder2MLP(
+            config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+        )
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.norm_epsilon
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class Starcoder2Model(nn.Module):
+
+    def __init__(
+        self,
+        config: Starcoder2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens",
+        )
+
+        pp_group = get_pp_group()
+        pp_size = pp_group.world_size
+        pp_rank = pp_group.rank
+        self.start_layer = pp_rank * config.num_hidden_layers // pp_size
+        self.end_layer = (pp_rank + 1) * config.num_hidden_layers // pp_size
+
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Starcoder2DecoderLayer(
+                config=config, quant_config=quant_config, layer_id=idx, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
+        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if inputs_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = inputs_embeds
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class Starcoder2ForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: Starcoder2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.model = Starcoder2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.vocab_size = config.vocab_size
+        self.unpadded_vocab_size = config.vocab_size
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.unpadded_vocab_size = config.vocab_size
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+                quant_config=quant_config,
+                prefix=f"{prefix}.lm_head",
+            )
+        self.logits_processor = LogitsProcessor(config=config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            inputs_embeds=inputs_embeds,
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freqs" in name:
+                continue
+
+            is_stacked = False
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name in name:
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight, shard_id)
+                    is_stacked = True
+                    break
+            if is_stacked:
+                continue
+
+            param = params_dict.get(name)
+            if param is None:
+                continue
+
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+
+EntryClass = Starcoder2ForCausalLM
@@ -66,8 +66,8 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.utils import add_prefix

-tp_size =
-tp_rank =
+tp_size: Optional[int] = None
+tp_rank: Optional[int] = None


 def gate_up_proj_weight_loader(
@@ -341,6 +341,13 @@ class LlamaModel(nn.Module):
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
+
+        global tp_size, tp_rank
+        if tp_size is None:
+            tp_size = get_tensor_model_parallel_world_size()
+        if tp_rank is None:
+            tp_rank = get_tensor_model_parallel_rank()
+
         self.config = config
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
@@ -0,0 +1,51 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import torch
+
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+
+
+if _is_cuda:
+    from sgl_kernel import FusedSetKVBufferArg
+
+
+def enable_fused_set_kv_buffer(forward_batch: ForwardBatch):
+    """Enable fused set_kv_buffer only on CUDA with bfloat16 KV cache."""
+    return _is_cuda and forward_batch.token_to_kv_pool.dtype == torch.bfloat16
+
+
+def create_fused_set_kv_buffer_arg(
+    value: torch.Tensor,
+    layer: RadixAttention,
+    forward_batch: ForwardBatch,
+):
+    layer_id = layer.layer_id
+    token_to_kv_pool = forward_batch.token_to_kv_pool
+
+    k_buffer = token_to_kv_pool.get_key_buffer(layer_id)
+    v_buffer = token_to_kv_pool.get_value_buffer(layer_id)
+
+    return FusedSetKVBufferArg(
+        value=value,
+        k_buffer=k_buffer.view(k_buffer.shape[0], -1),
+        v_buffer=v_buffer.view(v_buffer.shape[0], -1),
+        k_scale=layer.k_scale,
+        v_scale=layer.v_scale,
+        cache_loc=forward_batch.out_cache_loc,
+    )
@@ -234,19 +234,27 @@ class BaseMultimodalProcessor(ABC):
             and isinstance(processor.image_processor, BaseImageProcessorFast)
             and not self.server_args.disable_fast_image_processor
         ):
-
+            if not _is_npu:
+                kwargs["device"] = "cuda"
+            elif processor.__class__.__name__ not in {
+                "Qwen2_5_VLProcessor",
+                "Qwen3VLProcessor",
+            }:
+                # Note: for qwen-vl, processor has some reshape issue because of dims restriction on Ascend.
+                kwargs["device"] = "npu"
         result = processor.__call__(
            text=[input_text],
            padding=True,
            return_tensors="pt",
            **kwargs,
        )
-
-
-
-
-
-
+        if not self.server_args.keep_mm_feature_on_device:
+            # move feature tensors to cpu
+            for feature_name in self.FEATURE_NAMES:
+                if feature_name in result and isinstance(
+                    result[feature_name], torch.Tensor
+                ):
+                    result[feature_name] = result[feature_name].to("cpu")

         return result

@@ -5,6 +5,7 @@ from typing import Dict, List, Union

 from PIL import Image

+from sglang.srt.models.dots_ocr import DotsOCRForCausalLM
 from sglang.srt.models.dots_vlm import DotsVLMForCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
@@ -14,7 +15,7 @@ from sglang.srt.multimodal.processors.qwen_vl import resize_image_async


 class DotsVLMImageProcessor(BaseMultimodalProcessor):
-    models = [DotsVLMForCausalLM]
+    models = [DotsVLMForCausalLM, DotsOCRForCausalLM]

     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
         super().__init__(hf_config, server_args, _processor, *args, **kwargs)
@@ -82,11 +83,9 @@ class DotsVLMImageProcessor(BaseMultimodalProcessor):
                 for image in base_output.images
             ]
             base_output.images = await asyncio.gather(*resize_tasks)
-
         combined_mm_item, input_ids, _ = self.process_and_combine_mm_data(
             base_output, self.mm_tokens
         )
-
         if combined_mm_item is None:
             return None

@@ -1,5 +1,7 @@
 # Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py

+from functools import lru_cache
+
 import numpy as np
 import torch
 import torchvision.transforms as T
@@ -19,6 +21,20 @@ from sglang.srt.multimodal.processors.base_processor import (
 class InternVLImageProcessor(BaseMultimodalProcessor):
     models = [InternVLChatModel, InternS1ForConditionalGeneration]

+    IMAGENET_MEAN = [0.485, 0.456, 0.406]
+    IMAGENET_STD = [0.229, 0.224, 0.225]
+
+    @staticmethod
+    @lru_cache(maxsize=1)
+    def _get_normalize_tensors(device="cuda", dtype=torch.float32):
+        mean = torch.tensor(
+            InternVLImageProcessor.IMAGENET_MEAN, device=device, dtype=dtype
+        ).view(-1, 1, 1)
+        std = torch.tensor(
+            InternVLImageProcessor.IMAGENET_STD, device=device, dtype=dtype
+        ).view(-1, 1, 1)
+        return mean, std
+
     def __init__(self, hf_config, server_args, _image_processor, *args, **kwargs):
         super().__init__(hf_config, server_args, _image_processor, *args, **kwargs)
         image_size = (
@@ -88,6 +104,8 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             bound, fps, max_frame, first_idx=0, num_segments=num_segments
         )

+        mean, std = InternVLImageProcessor._get_normalize_tensors(device="cuda")
+
         for frame_index in frame_indices:
             # Load frame
             frame = vr[frame_index]
@@ -97,10 +115,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             img_np = frame.asnumpy()
             img = torch.from_numpy(img_np).permute(2, 0, 1).cuda().float() / 255.0

-            # Using the mean and variance of the ImageNet dataset for all input images can lead to accuracy issues, while using the mean and variance of each input image is a more accurate choice.
-            mean = img.mean(dim=[1, 2], keepdim=True)
-            # Prevent division by zero; clamp to minimum value of 1e-6
-            std = img.std(dim=[1, 2], keepdim=True).clamp(min=1e-6)
             img = (img - mean) / std

             tiles = InternVLImageProcessor.dynamic_preprocess(
@@ -188,6 +202,8 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         num_patches_list = []
         pixel_values = []

+        mean, std = InternVLImageProcessor._get_normalize_tensors(device="cuda")
+
         # Process each input with allocated frames
         for image_index, image in enumerate(base_output.images):
             try:
@@ -201,10 +217,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
                 else:
                     tensor = image.cuda()  # assume already tensor

-                # Using the mean and variance of the ImageNet dataset for all input images can lead to accuracy issues, while using the mean and variance of each input image is a more accurate choice.
-                mean = tensor.mean(dim=[1, 2], keepdim=True)
-                # Prevent division by zero; clamp to minimum value of 1e-6
-                std = tensor.std(dim=[1, 2], keepdim=True).clamp(min=1e-6)
                 tensor = (tensor - mean) / std
                 tiles = self.dynamic_preprocess(
                     tensor, image_size=448, max_num=12, use_thumbnail=True
@@ -12,6 +12,8 @@ from torchvision.transforms import InterpolationMode
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
+from sglang.srt.models.qwen3_vl import Qwen3VLForConditionalGeneration
+from sglang.srt.models.qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
@@ -209,7 +211,12 @@ async def preprocess_video(

 # Compatible with Qwen2VL and Qwen2_5VL
 class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
-    models = [
+    models = [
+        Qwen2VLForConditionalGeneration,
+        Qwen2_5_VLForConditionalGeneration,
+        Qwen3VLForConditionalGeneration,
+        Qwen3VLMoeForConditionalGeneration,
+    ]

     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
         super().__init__(hf_config, server_args, _processor, *args, **kwargs)
@@ -0,0 +1,81 @@
+from typing import List, Union
+
+from sglang.srt.models.sarashina2_vision import Sarashina2VisionForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class Sarashina2VisionProcessor(BaseMultimodalProcessor):
+    models = [Sarashina2VisionForCausalLM]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+        # Sarashina2Vision specific tokens (default is <|file|>)
+        self.IMAGE_TOKEN = "<|file|>"
+        self.IM_TOKEN_ID = getattr(hf_config, "image_token_index", 14)
+        self.IM_START_ID = getattr(hf_config, "start_image_token_index", 102397)
+        self.IM_END_ID = getattr(hf_config, "end_image_token_index", 102398)
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self.IMAGE_TOKEN,
+            image_token_id=self.IM_TOKEN_ID,
+        ).build(_processor)
+
+        # Patch the processor's image processor to handle parameter compatibility
+        if hasattr(_processor, "image_processor") and hasattr(
+            _processor.image_processor, "_preprocess"
+        ):
+            original_preprocess = _processor.image_processor._preprocess
+
+            def patched_preprocess(*args, **kwargs):
+                # Filter kwargs to only include parameters that the custom _preprocess method accepts
+                # Based on Sarashina2VisionImageProcessor._preprocess signature
+                allowed_params = {
+                    "do_resize",
+                    "resample",
+                    "do_rescale",
+                    "rescale_factor",
+                    "do_normalize",
+                    "image_mean",
+                    "image_std",
+                    "do_convert_rgb",
+                    "data_format",
+                    "input_data_format",
+                }
+                filtered_kwargs = {
+                    k: v for k, v in kwargs.items() if k in allowed_params
+                }
+                return original_preprocess(*args, **filtered_kwargs)
+
+            _processor.image_processor._preprocess = patched_preprocess
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        """Process image data for Sarashina2Vision model using standard SGLang pattern."""
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+            base_output=base_output,
+            mm_tokens=self.mm_tokens,
+        )
+
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "im_token_id": self.mm_tokens.image_token_id,
+            "im_start_id": self.IM_START_ID,
+            "im_end_id": self.IM_END_ID,
+        }
@@ -89,6 +89,12 @@ def detect_jinja_template_content_format(chat_template: str) -> str:
     - If template has loops like {%- for content in message['content'] -%} → 'openai'
     - Otherwise → 'string'
     """
+    # Shortcut for multimodal templates
+    if any(
+        keyword in chat_template for keyword in ["image", "audio", "video", "vision"]
+    ):
+        return "openai"
+
     jinja_ast = _try_extract_ast(chat_template)
     if jinja_ast is None:
         return "string"