sglang 0.4.10.post1__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +113 -17
- sglang/compile_deep_gemm.py +8 -1
- sglang/global_config.py +5 -1
- sglang/srt/configs/model_config.py +35 -0
- sglang/srt/conversation.py +9 -117
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +6 -1
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -0
- sglang/srt/disaggregation/mooncake/conn.py +243 -135
- sglang/srt/disaggregation/prefill.py +3 -0
- sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- sglang/srt/distributed/parallel_state.py +22 -9
- sglang/srt/entrypoints/context.py +244 -0
- sglang/srt/entrypoints/engine.py +8 -5
- sglang/srt/entrypoints/harmony_utils.py +370 -0
- sglang/srt/entrypoints/http_server.py +106 -15
- sglang/srt/entrypoints/openai/protocol.py +227 -1
- sglang/srt/entrypoints/openai/serving_chat.py +278 -42
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +174 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_distribution.py +4 -2
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/harmony_tool_parser.py +130 -0
- sglang/srt/hf_transformers_utils.py +55 -13
- sglang/srt/jinja_template_utils.py +8 -1
- sglang/srt/layers/attention/aiter_backend.py +5 -8
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
- sglang/srt/layers/attention/vision.py +40 -15
- sglang/srt/layers/communicator.py +35 -8
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/linear.py +9 -8
- sglang/srt/layers/logits_processor.py +9 -1
- sglang/srt/layers/moe/cutlass_moe.py +20 -6
- sglang/srt/layers/moe/ep_moe/layer.py +87 -107
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +442 -58
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +169 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
- sglang/srt/layers/moe/topk.py +12 -3
- sglang/srt/layers/moe/utils.py +59 -0
- sglang/srt/layers/quantization/__init__.py +22 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +8 -7
- sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- sglang/srt/layers/quantization/fp8_utils.py +29 -0
- sglang/srt/layers/quantization/modelopt_quant.py +259 -64
- sglang/srt/layers/quantization/mxfp4.py +651 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/__init__.py +0 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +225 -1
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +15 -4
- sglang/srt/lora/lora_manager.py +70 -14
- sglang/srt/lora/lora_registry.py +10 -2
- sglang/srt/lora/mem_pool.py +43 -5
- sglang/srt/managers/cache_controller.py +61 -32
- sglang/srt/managers/data_parallel_controller.py +52 -2
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +21 -4
- sglang/srt/managers/mm_utils.py +5 -11
- sglang/srt/managers/schedule_batch.py +30 -8
- sglang/srt/managers/schedule_policy.py +3 -1
- sglang/srt/managers/scheduler.py +170 -18
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +59 -22
- sglang/srt/managers/tokenizer_manager.py +137 -67
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/managers/utils.py +45 -1
- sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- sglang/srt/mem_cache/hicache_storage.py +13 -21
- sglang/srt/mem_cache/hiradix_cache.py +53 -5
- sglang/srt/mem_cache/memory_pool_host.py +1 -1
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang/srt/model_executor/cuda_graph_runner.py +24 -9
- sglang/srt/model_executor/forward_batch_info.py +48 -17
- sglang/srt/model_executor/model_runner.py +24 -2
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +95 -50
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma3n_mm.py +39 -0
- sglang/srt/models/glm4_moe.py +102 -27
- sglang/srt/models/gpt_oss.py +1134 -0
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/llama4.py +13 -2
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mllama4.py +428 -19
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +7 -4
- sglang/srt/models/qwen3_moe.py +39 -14
- sglang/srt/models/step3_vl.py +10 -1
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +4 -3
- sglang/srt/multimodal/processors/gemma3n.py +0 -7
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/operations_strategy.py +1 -1
- sglang/srt/reasoning_parser.py +18 -39
- sglang/srt/server_args.py +218 -23
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- sglang/srt/two_batch_overlap.py +163 -9
- sglang/srt/utils.py +41 -26
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/runners.py +4 -4
- sglang/test/test_utils.py +4 -4
- sglang/version.py +1 -1
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +18 -15
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +143 -116
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen2_moe.py
CHANGED
@@ -148,7 +148,6 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -211,6 +210,7 @@ class Qwen2MoeAttention(nn.Module):
         max_position_embeddings: int = 8192,
         qkv_bias: int = True,
         quant_config: Optional[QuantizationConfig] = None,
+        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -268,6 +268,7 @@ class Qwen2MoeAttention(nn.Module):
             max_position=max_position_embeddings,
             base=rope_theta,
             rope_scaling=rope_scaling,
+            dual_chunk_attention_config=dual_chunk_attention_config,
         )
         self.attn = RadixAttention(
             self.num_heads,
@@ -309,6 +310,9 @@ class Qwen2MoeDecoderLayer(nn.Module):
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         qkv_bias = getattr(config, "qkv_bias", True)
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
         self.self_attn = Qwen2MoeAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -318,6 +322,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
+            dual_chunk_attention_config=dual_chunk_attention_config,
             qkv_bias=qkv_bias,
             prefix=add_prefix("self_attn", prefix),
         )
@@ -616,9 +621,7 @@ class Qwen2MoeForCausalLM(nn.Module):
             ("gate_up_proj", "up_proj", 1),
         ]

-
-
-        expert_params_mapping = MoEImpl.make_expert_params_mapping(
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
sglang/srt/models/qwen3_moe.py
CHANGED
@@ -24,6 +24,7 @@ import torch
 from torch import nn

 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -51,7 +52,6 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
@@ -72,7 +72,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP
 from sglang.srt.models.qwen2_moe import Qwen2MoeModel
 from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
-from sglang.srt.utils import
+from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty

 Qwen3MoeConfig = None

@@ -113,15 +113,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             quant_config=quant_config,
             prefix=add_prefix("experts", prefix),
             **(
-                dict(deepep_mode=
-                if global_server_args_dict["
+                dict(deepep_mode=global_server_args_dict["deepep_mode"])
+                if global_server_args_dict["moe_a2a_backend"].is_deepep()
                 else {}
             ),
             # Additional args for FusedMoE
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -136,9 +135,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             prefix=add_prefix("gate", prefix),
         )

-        if global_server_args_dict["
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             # TODO: we will support tp < ep in the future
-            self.ep_size =
+            self.ep_size = get_moe_expert_parallel_world_size()
             self.num_experts = (
                 config.num_experts + global_server_args_dict["ep_num_redundant_experts"]
             )
@@ -148,7 +147,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
     ) -> torch.Tensor:

-        if not global_server_args_dict["
+        if not global_server_args_dict["moe_a2a_backend"].is_deepep():
             return self.forward_normal(hidden_states)
         else:
             return self.forward_deepep(hidden_states, forward_batch)
@@ -296,6 +295,7 @@ class Qwen3MoeAttention(nn.Module):
         attention_bias: bool = False,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
         alt_stream: Optional[torch.cuda.Stream] = None,
     ) -> None:
         super().__init__()
@@ -354,6 +354,7 @@ class Qwen3MoeAttention(nn.Module):
             max_position=max_position_embeddings,
             base=rope_theta,
             rope_scaling=rope_scaling,
+            dual_chunk_attention_config=dual_chunk_attention_config,
         )
         self.attn = RadixAttention(
             self.num_heads,
@@ -459,6 +460,9 @@ class Qwen3MoeDecoderLayer(nn.Module):
         )
         rms_norm_eps = config.rms_norm_eps
         attention_bias = config.attention_bias
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
         self.self_attn = Qwen3MoeAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -472,6 +476,7 @@ class Qwen3MoeDecoderLayer(nn.Module):
             attention_bias=attention_bias,
             quant_config=quant_config,
             prefix=add_prefix("self_attn", prefix),
+            dual_chunk_attention_config=dual_chunk_attention_config,
             alt_stream=alt_stream,
         )

@@ -767,7 +772,10 @@ class Qwen3MoeForCausalLM(nn.Module):
             num_experts=self.config.num_experts,
         )

-        params_dict
+        # Cache params_dict to avoid repeated expensive traversal of model parameters
+        if not hasattr(self, "_cached_params_dict"):
+            self._cached_params_dict = dict(self.named_parameters())
+        params_dict = self._cached_params_dict
         for name, loaded_weight in weights:
             layer_id = get_layer_id(name)
             if (
@@ -806,11 +814,22 @@ class Qwen3MoeForCausalLM(nn.Module):
                     weight_loader(param, loaded_weight, shard_id)
                     break
                 else:
+                    # Track if this is an expert weight to enable early skipping
+                    is_expert_weight = False
+
                     for mapping in expert_params_mapping:
                         param_name, weight_name, expert_id, shard_id = mapping
                         if weight_name not in name:
                             continue
+
+                        # Mark as expert weight regardless of whether we can process it
+                        is_expert_weight = True
+
                         name = name.replace(weight_name, param_name)
+                        if name not in params_dict:
+                            # Expert weight not on this rank, will be skipped below
+                            continue
+
                         param = params_dict[name]
                         weight_loader = param.weight_loader
                         weight_loader(
@@ -822,6 +841,10 @@ class Qwen3MoeForCausalLM(nn.Module):
                         )
                         break
                     else:
+                        if is_expert_weight:
+                            # This is an expert weight but not mapped to this rank, skip all remaining processing
+                            continue
+
                         # Skip loading extra bias for GPTQ models.
                         if name.endswith(".bias") and name not in params_dict:
                             continue
@@ -838,11 +861,13 @@ class Qwen3MoeForCausalLM(nn.Module):
                 logger.warning(f"Parameter {name} not found in params_dict")

         # TODO mimic deepseek
-
-
-
-
-
+        # Lazy initialization of expert weights cache to avoid slowing down load_weights
+        if not hasattr(self, "routed_experts_weights_of_layer"):
+            self.routed_experts_weights_of_layer = {
+                layer_id: self.model.layers[layer_id].mlp.get_moe_weights()
+                for layer_id in range(self.start_layer, self.end_layer)
+                if isinstance(self.model.layers[layer_id].mlp, Qwen3MoeSparseMoeBlock)
+            }

     @classmethod
     def get_model_config_for_expert_location(cls, config):
sglang/srt/models/step3_vl.py
CHANGED
@@ -146,7 +146,7 @@ class Step3TextMoEMLP(nn.Module):
             prefix=add_prefix("gate", prefix),
         )

-        if global_server_args_dict["
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             raise NotImplementedError("DeepEP MoE is not supported yet in Step3 model.")

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -531,11 +531,18 @@ class Step3VisionMLP(nn.Module):
         prefix: str = "",
     ) -> None:
         super().__init__()
+        # Since this is a dense model,
+        # the MLP component likewise adopts a DP-MLP approach modeled after DP Attention.
+        # This choice may not represent the optimal solution and remains open to further deliberation.
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
         self.fc1 = ColumnParallelLinear(
             dim,
             intermediate_size,
             bias=bias,
             quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
             prefix=add_prefix("gate_proj", prefix),
         )
         self.act = ACT2FN[hidden_act]  # quick_gelu
@@ -544,6 +551,8 @@ class Step3VisionMLP(nn.Module):
             dim,
             bias=bias,
             quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
             prefix=add_prefix("down_proj", prefix),
         )

sglang/srt/models/transformers.py
CHANGED
@@ -211,16 +211,13 @@ class TransformersForCausalLM(nn.Module):
         Apply the model's tensor parallelization plan.
         Currently only supports linear layers.
         """
-
-        if tp_size <= 1:
-            return
+        tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {}

+        if not tp_plan and self.tp_size > 1:
             raise ValueError(
                 f"{type(self.model)} does not support tensor parallel yet!"
             )

-        tp_plan = self.model._tp_plan
-
         def _tensor_parallel(module: nn.Module, prefix: str = ""):
             for child_name, child_module in module.named_children():
                 qual_name = maybe_prefix(prefix, child_name)

sglang/srt/multimodal/processors/base_processor.py
CHANGED
@@ -12,7 +12,6 @@ import torch
 from PIL import Image
 from transformers import BaseImageProcessorFast

-from sglang.srt.managers.mm_utils import TransportProxyTensor
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.utils import load_audio, load_image, load_video, logger

@@ -218,8 +217,10 @@ class BaseMultimodalProcessor(ABC):
             kwargs["audio"] = audios

         processor = self._processor
-        if
-            processor
+        if (
+            hasattr(processor, "image_processor")
+            and isinstance(processor.image_processor, BaseImageProcessorFast)
+            and not self.server_args.disable_fast_image_processor
         ):
             kwargs["device"] = "cuda"
             result = processor.__call__(

sglang/srt/multimodal/processors/gemma3n.py
CHANGED
@@ -12,7 +12,6 @@
 # limitations under the License.
 # ==============================================================================

-import re
 from typing import Dict, List, Optional, Union

 from sglang.srt.managers.multimodal_processor import (
@@ -38,14 +37,8 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
         self.mm_tokens = MultimodalSpecialTokens(
             image_token="<image_soft_token>",
             image_token_id=hf_config.image_token_id,
-            image_token_regex=re.compile(
-                r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
-            ),
             audio_token="<audio_soft_token>",
             audio_token_id=hf_config.audio_token_id,
-            audio_token_regex=re.compile(
-                r"<start_of_audio>(?:(?:<audio_soft_token>)*<end_of_audio>)?"
-            ),
         ).build(_processor)

     async def process_mm_data_async(

sglang/srt/multimodal/processors/step3_vl.py
CHANGED
@@ -8,7 +8,7 @@ import torch
 from PIL import Image
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
-from transformers import BatchFeature, TensorType
+from transformers import BatchFeature, ProcessorMixin, TensorType

 from sglang.srt.models.step3_vl import Step3VLForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
@@ -276,6 +276,8 @@ class Step3VLProcessor:
         super().__init__()

         self.config = config
+        if isinstance(tokenizer, ProcessorMixin):
+            tokenizer = tokenizer.tokenizer
         self.tokenizer = tokenizer

         self.image_size = 728

sglang/srt/operations_strategy.py
CHANGED
@@ -4,7 +4,7 @@ from typing import List, Optional
 import torch

 from sglang.srt import operations
-from sglang.srt.layers.moe.
+from sglang.srt.layers.moe.token_dispatcher import DeepEPConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.operations import Operation

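The multimodal/processors/step3_vl.py change above guards against being handed a full HF processor where a bare tokenizer is expected. A hedged sketch of that guard follows, assuming the usual transformers convention that a ProcessorMixin exposes its wrapped tokenizer as .tokenizer; resolve_tokenizer is an illustrative helper, not part of sglang.

from transformers import ProcessorMixin


def resolve_tokenizer(tokenizer_or_processor):
    # A ProcessorMixin bundles an image processor and a tokenizer; callers that
    # pass the whole processor still end up with the inner tokenizer here.
    if isinstance(tokenizer_or_processor, ProcessorMixin):
        return tokenizer_or_processor.tokenizer
    return tokenizer_or_processor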
sglang/srt/reasoning_parser.py
CHANGED
@@ -131,7 +131,7 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
             If True, streams reasoning content as it arrives.
     """

-    def __init__(self, stream_reasoning: bool = True):
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = True):
         # DeepSeek-R1 is assumed to be reasoning until `</think>` token
         super().__init__(
             "<think>",
@@ -144,7 +144,7 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):

 class Qwen3Detector(BaseReasoningFormatDetector):
     """
-    Detector for
+    Detector for Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
     Assumes reasoning format:
       (<think>)*(.*)</think>

@@ -153,47 +153,16 @@ class Qwen3Detector(BaseReasoningFormatDetector):
     - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
     - enable_thinking=False: "The answer is 42." (no thinking tokens)

-    This detector handles both cases.
-
-    NOTE: Do NOT use this detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
-    Those models always generate thinking content without <think> start tags.
-    Use "qwen3-thinking" parser type for those models instead.
-
-    Args:
-        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
-            If True, streams reasoning content as it arrives.
-    """
-
-    def __init__(self, stream_reasoning: bool = True):
-        super().__init__(
-            "<think>",
-            "</think>",
-            force_reasoning=False,
-            stream_reasoning=stream_reasoning,
-        )
-
-
-class Qwen3ThinkingDetector(BaseReasoningFormatDetector):
-    """
-    Detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
-    Assumes reasoning format:
-      *(.*)</think>
-
-    These models always generate thinking content without <think> start tag.
-    They do not support the enable_thinking parameter and always think.
-
-    Format: "I need to think about this...</think>The answer is 42."
-
     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
             If True, streams reasoning content as it arrives.
     """

-    def __init__(self, stream_reasoning: bool = True):
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
         super().__init__(
             "<think>",
             "</think>",
-            force_reasoning=
+            force_reasoning=force_reasoning,
             stream_reasoning=stream_reasoning,
         )

@@ -207,7 +176,7 @@ class KimiDetector(BaseReasoningFormatDetector):
     and the rest of the text as `normal_text`.
     """

-    def __init__(self, stream_reasoning: bool = True):
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
         super().__init__(
             "◁think▷",
             "◁/think▷",
@@ -230,13 +199,18 @@ class ReasoningParser:
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
-        "qwen3-thinking":
+        "qwen3-thinking": Qwen3Detector,
         "glm45": Qwen3Detector,
         "kimi": KimiDetector,
         "step3": DeepSeekR1Detector,
     }

-    def __init__(
+    def __init__(
+        self,
+        model_type: Optional[str] = None,
+        stream_reasoning: bool = True,
+        force_reasoning: bool = False,
+    ):
         if not model_type:
             raise ValueError("Model type must be specified")

@@ -244,7 +218,12 @@ class ReasoningParser:
         if not detector_class:
             raise ValueError(f"Unsupported model type: {model_type}")

-
+        if model_type.lower() == "qwen3-thinking":
+            force_reasoning = True
+
+        self.detector = detector_class(
+            stream_reasoning=stream_reasoning, force_reasoning=force_reasoning
+        )

     def parse_non_stream(self, full_text: str) -> Tuple[str, str]:
         """Non-streaming call: one-time parsing"""