sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +119 -17
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +42 -7
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +14 -4
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +286 -160
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +15 -11
- sglang/srt/entrypoints/context.py +227 -0
- sglang/srt/entrypoints/engine.py +15 -9
- sglang/srt/entrypoints/harmony_utils.py +372 -0
- sglang/srt/entrypoints/http_server.py +74 -4
- sglang/srt/entrypoints/openai/protocol.py +218 -1
- sglang/srt/entrypoints/openai/serving_chat.py +41 -11
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +175 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +14 -1
- sglang/srt/layers/attention/aiter_backend.py +375 -115
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +22 -6
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +29 -14
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +3 -7
- sglang/srt/layers/moe/cutlass_moe.py +12 -3
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +135 -73
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +16 -4
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +27 -3
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +51 -10
- sglang/srt/layers/quantization/modelopt_quant.py +258 -68
- sglang/srt/layers/quantization/mxfp4.py +654 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +21 -12
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +506 -3
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +82 -62
- sglang/srt/lora/lora_registry.py +23 -11
- sglang/srt/lora/mem_pool.py +63 -68
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +75 -58
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -8
- sglang/srt/managers/mm_utils.py +6 -13
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +61 -25
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +41 -19
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +47 -30
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +80 -22
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +34 -36
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -9
- sglang/srt/model_executor/forward_batch_info.py +61 -19
- sglang/srt/model_executor/model_runner.py +148 -37
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +137 -59
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +38 -0
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +28 -16
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +1251 -0
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +332 -37
- sglang/srt/server_args.py +186 -75
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +169 -9
- sglang/srt/utils.py +41 -5
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/runners.py +2 -2
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/w8a8_int8.py

```diff
@@ -3,7 +3,18 @@ from __future__ import annotations
 import importlib
 import sys
 from types import MappingProxyType
-from typing import
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+    cast,
+)

 import torch
 from torch.nn.parameter import Parameter
```
```diff
@@ -79,22 +90,16 @@ def npu_wrapper_rmsnorm_forward(func):
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         if not x.is_contiguous():
             x = x.contiguous()
-        original_dtype = x.dtype
-        x = x.to(torch.float32)
         if residual is not None:
-
-
-
-
-
-                x, self.weight.to(torch.float32), self.variance_epsilon
-            )[0]
-            + self.bias
-        )
+            out, _, residual_out = torch_npu.npu_add_rms_norm(
+                residual, x, self.weight.data, self.variance_epsilon
+            )
+            out = out + self.bias
+            return out.to(x.dtype), residual_out

-
-
-        return
+        out = torch_npu.npu_rms_norm(x, self.weight.data, self.variance_epsilon)[0]
+        out = out + self.bias
+        return out.to(x.dtype)

     return _rmsnorm_forward_oot

```
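The rewritten NPU RMSNorm wrapper drops the manual float32 upcast and instead calls the fused `torch_npu.npu_add_rms_norm` kernel when a residual is present (falling back to `torch_npu.npu_rms_norm` otherwise), then adds the extra `bias` term. As a rough reference for what the fused add-RMSNorm step computes, here is a plain-PyTorch sketch; the function name and accumulation dtype are illustrative assumptions, not the kernel's documented behavior:

```python
import torch

def add_rms_norm_reference(residual, x, weight, eps):
    # Illustrative only: fuse the residual add with RMSNorm and return both
    # the normalized output and the post-add residual, mirroring the
    # (out, _, residual_out) tuple the new wrapper unpacks.
    hidden = x + residual
    variance = hidden.pow(2).mean(dim=-1, keepdim=True)
    out = hidden * torch.rsqrt(variance + eps) * weight
    return out, hidden

x = torch.randn(2, 4, 8)
res = torch.randn(2, 4, 8)
w = torch.ones(8)
out, new_res = add_rms_norm_reference(res, x, w, eps=1e-6)
```

Doing the add and the normalization in one kernel also removes the explicit `x.to(torch.float32)` round-trip that the old code performed in Python.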
```diff
@@ -250,17 +255,23 @@ class W8A8Int8Config(QuantizationConfig):

         if _is_npu:
             if isinstance(layer, LinearBase):
+                key = "model"
+                if "vision_model" in prefix:
+                    key = "vision_model"
+                elif "visual" in prefix:
+                    key = "visual"
+                packed_modules_mapping_subset = self.packed_modules_mapping.get(key, {})
                 prefix_in_quant_config = prefix
                 proj_name = prefix.split(".")[-1]
-                if proj_name in
+                if proj_name in packed_modules_mapping_subset:
                     prefix_in_quant_config = prefix.replace(
-                        proj_name,
+                        proj_name, packed_modules_mapping_subset[proj_name][0]
                     )
                 self.is_dynamic = (
                     self.quant_description[prefix_in_quant_config + ".weight"]
                     == "W8A8_DYNAMIC"
                 )
-                if self.is_layer_skipped(prefix,
+                if self.is_layer_skipped(prefix, packed_modules_mapping_subset):
                     return UnquantizedLinearMethod()
                 return (
                     NPU_W8A8DynamicLinearMethod(self)
```
```diff
@@ -571,8 +582,10 @@ class NPU_W8A8LinearMethodImpl:
         layer: torch.nn.Module,
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
-        tp_rank: Optional[int] = 0,
     ) -> torch.Tensor:
+        # To prevent import loops
+        from sglang.srt.layers.linear import RowParallelLinear
+
         original_dtype = x.dtype
         if original_dtype != torch.int8:
             x = torch_npu.npu_quantize(
```
```diff
@@ -583,8 +596,12 @@
                 -1,
                 True,
             )
-
-
+        # Only fuse bias add into GEMM for rank 0 (this ensures that
+        # bias will not get added more than once in Attention TP>1 case)
+        if isinstance(layer, RowParallelLinear) and layer.tp_rank > 0:
+            quant_bias = None
+        else:
+            quant_bias = layer.quant_bias
         return torch_npu.npu_quant_matmul(
             x,
             layer.weight,
```
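Both NPU linear methods follow the same per-tensor W8A8 pattern: quantize the activation to int8 if it is not already, then run a GEMM that dequantizes with `deq_scale` and optionally folds in `quant_bias`. A plain-PyTorch sketch of that flow for reference; the parameter names mirror the layer attributes visible in the diff (`input_scale`, `deq_scale`, `quant_bias`), but the exact semantics of the NPU kernels are assumptions here:

```python
import torch

def w8a8_apply_reference(x, weight_int8, input_scale, deq_scale, quant_bias=None):
    # Illustrative per-tensor W8A8 linear (not the NPU kernels themselves):
    # quantize the activation to int8, run the GEMM on the quantized values
    # (real kernels accumulate in int32; float is used so the sketch runs
    # anywhere), then add the optional pre-scaled bias and dequantize.
    x_q = torch.clamp(torch.round(x / input_scale), -128, 127)
    acc = x_q @ weight_int8.float()
    if quant_bias is not None:
        acc = acc + quant_bias.float()
    return acc * deq_scale

x = torch.randn(4, 8)
w_int8 = torch.randint(-128, 128, (8, 3), dtype=torch.int8)
y = w8a8_apply_reference(x, w_int8, input_scale=0.05, deq_scale=0.01)
```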
```diff
@@ -651,13 +668,21 @@ class NPU_W8A8LinearMethodMTImpl:
         layer: torch.nn.Module,
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
-        tp_rank: Optional[int] = 0,
     ) -> torch.Tensor:
+        # To prevent import loops
+        from sglang.srt.layers.linear import RowParallelLinear
+
         original_dtype = x.dtype
         if original_dtype != torch.int8:
             x = quant_per_tensor(x, layer.input_scale, layer.input_offset)

-
+        # Only fuse bias add into GEMM for rank 0 (this ensures that
+        # bias will not get added more than once in Attention TP>1 case)
+        if isinstance(layer, RowParallelLinear) and layer.tp_rank > 0:
+            quant_bias = None
+        else:
+            quant_bias = layer.quant_bias
+
         return ops.quant_matmul(
             x=x, weight=layer.weight, deq_scale=layer.deq_scale, deq_bias=quant_bias
         )
```
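The `quant_bias` guard encodes a standard tensor-parallel rule: a `RowParallelLinear` produces partial GEMM outputs that are summed across ranks afterwards, so a bias folded into every rank's GEMM would end up counted `tp_size` times. A toy PyTorch illustration (not SGLang code) of why only one rank may fuse the bias:

```python
import torch

# A row-parallel linear splits the input features across ranks; each rank
# computes a partial x_i @ w_i, and the partials are summed (all-reduced).
tp_size = 2
x = torch.randn(4, 8)
w = torch.randn(8, 3)
bias = torch.ones(3)

partials_bias_on_all_ranks = [
    x[:, i::tp_size] @ w[i::tp_size] + bias for i in range(tp_size)
]
partials_bias_on_rank0_only = [
    x[:, i::tp_size] @ w[i::tp_size] + (bias if i == 0 else 0)
    for i in range(tp_size)
]

reference = x @ w + bias
assert not torch.allclose(sum(partials_bias_on_all_ranks), reference)  # bias counted twice
assert torch.allclose(sum(partials_bias_on_rank0_only), reference, atol=1e-5)
```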
```diff
@@ -737,11 +762,6 @@ class NPU_W8A8LinearMethod(LinearMethodBase):
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        from sglang.srt.layers.linear import RowParallelLinear
-
-        if isinstance(layer, RowParallelLinear):
-            tp_rank = get_tensor_model_parallel_rank()
-            return self.quant_method.apply(layer, x, bias, tp_rank)
         return self.quant_method.apply(layer, x, bias)


```
```diff
@@ -780,7 +800,6 @@ class NPU_W8A8DynamicLinearMethodImpl:
         tp_rank: Optional[int] = 0,
     ) -> torch.Tensor:
         original_dtype = x.dtype
-        # use ATB quantize
        quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(x)
         return torch_npu.npu_quant_matmul(
             quant_out,
```
```diff
@@ -863,11 +882,6 @@ class NPU_W8A8DynamicLinearMethod(LinearMethodBase):
         x: torch.Tensor,
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        from sglang.srt.layers.linear import RowParallelLinear
-
-        if isinstance(layer, RowParallelLinear):
-            tp_rank = get_tensor_model_parallel_rank()
-            return self.quant_method.apply(layer, x, bias, tp_rank)
         return self.quant_method.apply(layer, x, bias)


```