sglang 0.4.10.post1__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +113 -17
- sglang/compile_deep_gemm.py +8 -1
- sglang/global_config.py +5 -1
- sglang/srt/configs/model_config.py +35 -0
- sglang/srt/conversation.py +9 -117
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +6 -1
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -0
- sglang/srt/disaggregation/mooncake/conn.py +243 -135
- sglang/srt/disaggregation/prefill.py +3 -0
- sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- sglang/srt/distributed/parallel_state.py +22 -9
- sglang/srt/entrypoints/context.py +244 -0
- sglang/srt/entrypoints/engine.py +8 -5
- sglang/srt/entrypoints/harmony_utils.py +370 -0
- sglang/srt/entrypoints/http_server.py +106 -15
- sglang/srt/entrypoints/openai/protocol.py +227 -1
- sglang/srt/entrypoints/openai/serving_chat.py +278 -42
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +174 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_distribution.py +4 -2
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/harmony_tool_parser.py +130 -0
- sglang/srt/hf_transformers_utils.py +55 -13
- sglang/srt/jinja_template_utils.py +8 -1
- sglang/srt/layers/attention/aiter_backend.py +5 -8
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
- sglang/srt/layers/attention/vision.py +40 -15
- sglang/srt/layers/communicator.py +35 -8
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/linear.py +9 -8
- sglang/srt/layers/logits_processor.py +9 -1
- sglang/srt/layers/moe/cutlass_moe.py +20 -6
- sglang/srt/layers/moe/ep_moe/layer.py +87 -107
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +442 -58
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +169 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
- sglang/srt/layers/moe/topk.py +12 -3
- sglang/srt/layers/moe/utils.py +59 -0
- sglang/srt/layers/quantization/__init__.py +22 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +8 -7
- sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- sglang/srt/layers/quantization/fp8_utils.py +29 -0
- sglang/srt/layers/quantization/modelopt_quant.py +259 -64
- sglang/srt/layers/quantization/mxfp4.py +651 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/__init__.py +0 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +225 -1
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +15 -4
- sglang/srt/lora/lora_manager.py +70 -14
- sglang/srt/lora/lora_registry.py +10 -2
- sglang/srt/lora/mem_pool.py +43 -5
- sglang/srt/managers/cache_controller.py +61 -32
- sglang/srt/managers/data_parallel_controller.py +52 -2
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +21 -4
- sglang/srt/managers/mm_utils.py +5 -11
- sglang/srt/managers/schedule_batch.py +30 -8
- sglang/srt/managers/schedule_policy.py +3 -1
- sglang/srt/managers/scheduler.py +170 -18
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +59 -22
- sglang/srt/managers/tokenizer_manager.py +137 -67
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/managers/utils.py +45 -1
- sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- sglang/srt/mem_cache/hicache_storage.py +13 -21
- sglang/srt/mem_cache/hiradix_cache.py +53 -5
- sglang/srt/mem_cache/memory_pool_host.py +1 -1
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang/srt/model_executor/cuda_graph_runner.py +24 -9
- sglang/srt/model_executor/forward_batch_info.py +48 -17
- sglang/srt/model_executor/model_runner.py +24 -2
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +95 -50
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma3n_mm.py +39 -0
- sglang/srt/models/glm4_moe.py +102 -27
- sglang/srt/models/gpt_oss.py +1134 -0
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/llama4.py +13 -2
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mllama4.py +428 -19
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +7 -4
- sglang/srt/models/qwen3_moe.py +39 -14
- sglang/srt/models/step3_vl.py +10 -1
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +4 -3
- sglang/srt/multimodal/processors/gemma3n.py +0 -7
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/operations_strategy.py +1 -1
- sglang/srt/reasoning_parser.py +18 -39
- sglang/srt/server_args.py +218 -23
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- sglang/srt/two_batch_overlap.py +163 -9
- sglang/srt/utils.py +41 -26
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/runners.py +4 -4
- sglang/test/test_utils.py +4 -4
- sglang/version.py +1 -1
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +18 -15
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +143 -116
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/models/grok.py
CHANGED
```diff
@@ -29,6 +29,7 @@ from torch import nn
 from transformers import PretrainedConfig
 
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
@@ -117,7 +118,7 @@ class Grok1MoE(nn.Module):
         )
 
         kwargs = {}
-        if
+        if get_moe_expert_parallel_world_size() > 1:
             MoEImpl = EPMoE
         else:
             MoEImpl = FusedMoE
@@ -616,8 +617,7 @@ class Grok1ForCausalLM(nn.Module):
 
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-
-        expert_params_mapping = MoEImpl.make_expert_params_mapping(
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
```
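The net effect of the Grok change (mirrored in Mixtral below) is that the MoE backend is now chosen from the runtime expert-parallel world size rather than from the previous condition, which is truncated in this rendering. A minimal, self-contained sketch of that selection rule follows; `EPMoE` and `FusedMoE` here are placeholders, not imports of the real sglang classes.

```python
# Minimal sketch of the new selection rule; EPMoE and FusedMoE are
# placeholders standing in for the real sglang implementations.
class EPMoE:      # stands in for the expert-parallel implementation
    pass


class FusedMoE:   # stands in for the fused (tensor-parallel) implementation
    pass


def pick_moe_impl(moe_ep_world_size: int):
    # Shard experts across ranks only when the MoE expert-parallel group
    # has more than one rank; otherwise use the fused kernel path.
    return EPMoE if moe_ep_world_size > 1 else FusedMoE


assert pick_moe_impl(1) is FusedMoE
assert pick_moe_impl(8) is EPMoE
```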
sglang/srt/models/llama4.py
CHANGED
```diff
@@ -241,13 +241,22 @@ class Llama4Attention(nn.Module):
             if self.use_qk_norm
             else None
         )
+
+        qkv_quant_config = quant_config
+        o_quant_config = quant_config
+        if quant_config and hasattr(quant_config, "ignore") and quant_config.ignore:
+            if add_prefix("q_proj", prefix) in quant_config.ignore:
+                qkv_quant_config = None
+            if add_prefix("o_proj", prefix) in quant_config.ignore:
+                o_quant_config = None
+
         self.qkv_proj = QKVParallelLinear(
             hidden_size=hidden_size,
             head_size=self.head_dim,
             total_num_heads=self.total_num_heads,
             total_num_kv_heads=self.total_num_kv_heads,
             bias=bias,
-            quant_config=
+            quant_config=qkv_quant_config,
             prefix=add_prefix("qkv_proj", prefix),
             tp_rank=attn_tp_rank,
             tp_size=attn_tp_size,
@@ -257,7 +266,7 @@ class Llama4Attention(nn.Module):
             input_size=self.total_num_heads * self.head_dim,
             output_size=hidden_size,
             bias=bias_o_proj,
-            quant_config=
+            quant_config=o_quant_config,
             prefix=add_prefix("o_proj", prefix),
             tp_rank=attn_tp_rank,
             tp_size=attn_tp_size,
@@ -406,6 +415,8 @@ class Llama4DecoderLayer(nn.Module):
         )
 
     def _is_moe_layer(self, layer_id: int) -> bool:
+        if self.config.interleave_moe_layer_step == 0:
+            return self.config.num_local_experts > 0
         return (layer_id + 1) % self.config.interleave_moe_layer_step == 0
 
     def forward(
```
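The Llama4Attention hunks route each projection through its own quantization config, so a projection listed in the quantization `ignore` list stays unquantized (the removed `quant_config=` lines are truncated in this rendering). The `_is_moe_layer` hunk additionally guards against `interleave_moe_layer_step == 0`, which would otherwise raise a `ZeroDivisionError`. Below is a self-contained sketch of the ignore-list pattern; `QuantConfig` and `add_prefix` are simplified stand-ins, not the sglang APIs.

```python
# Sketch of the per-projection ignore-list pattern; QuantConfig and add_prefix
# are simplified assumptions, not the real sglang definitions.
from dataclasses import dataclass, field
from typing import List, Optional, Tuple


def add_prefix(name: str, prefix: str) -> str:
    # Assumed convention: "<prefix>.<name>" when a prefix is present.
    return f"{prefix}.{name}" if prefix else name


@dataclass
class QuantConfig:
    ignore: List[str] = field(default_factory=list)


def resolve_proj_quant(
    quant_config: Optional[QuantConfig], prefix: str
) -> Tuple[Optional[QuantConfig], Optional[QuantConfig]]:
    """Return (qkv_quant_config, o_quant_config), dropping quantization
    for any projection whose full name appears in quant_config.ignore."""
    qkv_qc, o_qc = quant_config, quant_config
    if quant_config and getattr(quant_config, "ignore", None):
        if add_prefix("q_proj", prefix) in quant_config.ignore:
            qkv_qc = None
        if add_prefix("o_proj", prefix) in quant_config.ignore:
            o_qc = None
    return qkv_qc, o_qc


cfg = QuantConfig(ignore=["model.layers.0.self_attn.q_proj"])
assert resolve_proj_quant(cfg, "model.layers.0.self_attn") == (None, cfg)
```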
sglang/srt/models/mixtral.py
CHANGED
```diff
@@ -24,6 +24,7 @@ from torch import nn
 from transformers import MixtralConfig
 
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_pp_group,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
@@ -94,7 +95,7 @@ class MixtralMoE(nn.Module):
             renormalize=True,
         )
 
-        MoEImpl = EPMoE if
+        MoEImpl = EPMoE if get_moe_expert_parallel_world_size() > 1 else FusedMoE
         self.experts = MoEImpl(
             num_experts=num_experts,
             top_k=top_k,
@@ -398,8 +399,7 @@ class MixtralForCausalLM(nn.Module):
 
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-
-        expert_params_mapping = MoEImpl.make_expert_params_mapping(
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
```
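Mixtral gets the same `get_moe_expert_parallel_world_size()`-based backend selection as Grok, and like Grok it now builds the checkpoint mapping via `FusedMoE.make_expert_params_mapping` regardless of which implementation serves the layer, presumably because the mapping depends only on the checkpoint tensor names (`w1`/`w2`/`w3`), not on the runtime backend. The sketch below is a hypothetical, simplified illustration of a mapping with the shape noted in the diff comment, `(param_name, weight_name, expert_id, shard_id)`; it is not the real `FusedMoE.make_expert_params_mapping`.

```python
# Hypothetical illustration of an expert-params mapping with entries of the
# form (param_name, weight_name, expert_id, shard_id); simplified, not the
# real FusedMoE.make_expert_params_mapping.
def make_expert_params_mapping(
    ckpt_gate_proj_name: str,
    ckpt_down_proj_name: str,
    ckpt_up_proj_name: str,
    num_experts: int,
):
    mapping = []
    for expert_id in range(num_experts):
        for ckpt_name, shard_id in (
            (ckpt_gate_proj_name, "w1"),  # gate projection -> w1 shard
            (ckpt_up_proj_name, "w3"),    # up projection   -> w3 shard
            (ckpt_down_proj_name, "w2"),  # down projection -> w2
        ):
            # Gate/up shards share a fused parameter; down has its own.
            param_name = (
                "experts.w13_weight" if shard_id in ("w1", "w3") else "experts.w2_weight"
            )
            weight_name = f"experts.{expert_id}.{ckpt_name}.weight"
            mapping.append((param_name, weight_name, expert_id, shard_id))
    return mapping


# Two experts, three projections each -> six mapping entries.
assert len(make_expert_params_mapping("w1", "w2", "w3", num_experts=2)) == 6
```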