sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +119 -17
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +42 -7
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +14 -4
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +286 -160
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +15 -11
- sglang/srt/entrypoints/context.py +227 -0
- sglang/srt/entrypoints/engine.py +15 -9
- sglang/srt/entrypoints/harmony_utils.py +372 -0
- sglang/srt/entrypoints/http_server.py +74 -4
- sglang/srt/entrypoints/openai/protocol.py +218 -1
- sglang/srt/entrypoints/openai/serving_chat.py +41 -11
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +175 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +14 -1
- sglang/srt/layers/attention/aiter_backend.py +375 -115
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +22 -6
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +29 -14
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +3 -7
- sglang/srt/layers/moe/cutlass_moe.py +12 -3
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +135 -73
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +16 -4
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +27 -3
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +51 -10
- sglang/srt/layers/quantization/modelopt_quant.py +258 -68
- sglang/srt/layers/quantization/mxfp4.py +654 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +21 -12
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +506 -3
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +82 -62
- sglang/srt/lora/lora_registry.py +23 -11
- sglang/srt/lora/mem_pool.py +63 -68
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +75 -58
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -8
- sglang/srt/managers/mm_utils.py +6 -13
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +61 -25
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +41 -19
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +47 -30
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +80 -22
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +34 -36
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -9
- sglang/srt/model_executor/forward_batch_info.py +61 -19
- sglang/srt/model_executor/model_runner.py +148 -37
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +137 -59
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +38 -0
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +28 -16
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +1251 -0
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +332 -37
- sglang/srt/server_args.py +186 -75
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +169 -9
- sglang/srt/utils.py +41 -5
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/runners.py +2 -2
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/lora/backend/flashinfer_backend.py
DELETED
@@ -1,131 +0,0 @@
-from typing import Tuple
-
-import torch
-
-from sglang.srt.lora.backend.base_backend import BaseLoRABackend
-from sglang.srt.lora.utils import LoRABatchInfo
-from sglang.srt.utils import is_flashinfer_available
-
-if is_flashinfer_available():
-    from flashinfer import SegmentGEMMWrapper
-
-
-class FlashInferLoRABackend(BaseLoRABackend):
-
-    def __init__(self, name: str, batch_info: LoRABatchInfo = None):
-        super().__init__(name, batch_info)
-
-        # Set up SGemm Wrapper from flashinfer
-        # FIXME wait for flashinfer segment gemm update
-        workspace_buffer = torch.empty(1 * 1024 * 1024, dtype=torch.int8, device="cuda")
-        self.segment_gemm = SegmentGEMMWrapper(workspace_buffer)
-
-    def run_lora_a_sgemm(
-        self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs
-    ) -> torch.Tensor:
-
-        return self.segment_gemm.run(
-            x=x,
-            weights=weights,
-            batch_size=self.batch_info.bs,
-            weight_column_major=True,
-            seg_indptr=self.batch_info.seg_indptr,
-            weight_indices=self.batch_info.weight_indices,
-        )
-
-    def run_lora_b_sgemm(
-        self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs
-    ) -> torch.Tensor:
-
-        return (
-            self.segment_gemm.run(
-                x=x,
-                weights=weights,
-                batch_size=self.batch_info.bs,
-                weight_column_major=True,
-                seg_indptr=self.batch_info.seg_indptr,
-                weight_indices=self.batch_info.weight_indices,
-            )
-            * self.batch_info.scalings[0]
-        )
-
-    def run_qkv_lora(
-        self,
-        x: torch.Tensor,
-        qkv_lora_a: torch.Tensor,
-        qkv_lora_b: Tuple[torch.Tensor],
-        *args,
-        **kwargs,
-    ) -> torch.Tensor:
-
-        assert isinstance(qkv_lora_b, tuple) and len(qkv_lora_b) == 2
-
-        # Shape of lora_a_output: (s, 3 * r)
-        lora_a_output = self.run_lora_a_sgemm(x=x, weights=qkv_lora_a)
-
-        q_lora_b, kv_lora_b = qkv_lora_b
-        lora_rank = kv_lora_b.shape[-1]
-        output_dim_q = q_lora_b.shape[-2]
-        output_dim_kv = kv_lora_b.shape[-2]
-        lora_output = torch.empty(
-            (x.shape[0], output_dim_q + 2 * output_dim_kv),
-            device=x.device,
-            dtype=x.dtype,
-        )
-
-        # q
-        lora_output[:, :output_dim_q] = self.run_lora_b_sgemm(
-            x=lora_a_output[:, :lora_rank].contiguous(), weights=q_lora_b[0]
-        )
-
-        # kv
-        lora_output[:, output_dim_q : output_dim_q + output_dim_kv] = (
-            self.run_lora_b_sgemm(
-                x=lora_a_output[:, lora_rank : 2 * lora_rank].contiguous(),
-                weights=kv_lora_b[0],
-            )
-        )
-
-        lora_output[
-            :, output_dim_q + output_dim_kv : output_dim_q + 2 * output_dim_kv
-        ] = self.run_lora_b_sgemm(
-            x=lora_a_output[:, 2 * lora_rank : 3 * lora_rank].contiguous(),
-            weights=kv_lora_b[1],
-        )
-
-        return lora_output * self.batch_info.scalings[0]
-
-    def run_gate_up_lora(
-        self,
-        x: torch.Tensor,
-        gate_up_lora_a: torch.Tensor,
-        gate_up_lora_b: Tuple[torch.Tensor],
-        *args,
-        **kwargs,
-    ) -> torch.Tensor:
-
-        assert isinstance(gate_up_lora_b, tuple) and len(gate_up_lora_b) == 2
-        lora_rank = gate_up_lora_b[0].shape[-1]
-        output_dim = gate_up_lora_b[0].shape[-2]
-
-        # Shape of lora_a_output: (s, 2 * r)
-        lora_a_output = self.run_lora_a_sgemm(x=x, weights=gate_up_lora_a)
-
-        lora_output = torch.empty(
-            (x.shape[0], 2 * output_dim),
-            device=x.device,
-            dtype=x.dtype,
-        )
-
-        # Compute lora for gate and up proj respectively
-        lora_output[:, :output_dim] = self.run_lora_b_sgemm(
-            x=lora_a_output[:, :lora_rank].contiguous(),
-            weights=gate_up_lora_b[0],
-        )
-
-        lora_output[:, output_dim:] = self.run_lora_b_sgemm(
-            x=lora_a_output[:, lora_rank:].contiguous(),
-            weights=gate_up_lora_b[1],
-        )
-
-        return lora_output * self.batch_info.scalings[0]
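For context on the removed backend above: run_qkv_lora performs one LoRA-A GEMM over the stacked q/k/v adapter, slices the resulting (s, 3 * r) activation into rank-r chunks, and applies the per-projection LoRA-B weights into a single preallocated output buffer, scaling once at the end. The following is a minimal standalone sketch of that slicing with plain torch matmuls; the tensor shapes and scaling value are hypothetical, and the segment GEMM and per-request weight indexing of the real backend are omitted.

import torch

# Hypothetical sizes: s tokens, LoRA rank r, and q/kv projection widths.
s, r = 4, 8
hidden, dim_q, dim_kv = 32, 16, 8
scaling = 1.0

x = torch.randn(s, hidden)
qkv_lora_a = torch.randn(hidden, 3 * r)   # stacked LoRA-A for q, k, v
q_lora_b = torch.randn(r, dim_q)          # LoRA-B for q
k_lora_b = torch.randn(r, dim_kv)         # LoRA-B for k
v_lora_b = torch.randn(r, dim_kv)         # LoRA-B for v

# One LoRA-A matmul yields an (s, 3 * r) activation; each rank-r slice
# feeds exactly one of the q/k/v LoRA-B projections.
lora_a_output = x @ qkv_lora_a
q_out = lora_a_output[:, :r] @ q_lora_b
k_out = lora_a_output[:, r : 2 * r] @ k_lora_b
v_out = lora_a_output[:, 2 * r :] @ v_lora_b

# Equivalent to filling the (s, dim_q + 2 * dim_kv) buffer slice by slice
# and scaling once at the end, as run_qkv_lora did.
lora_output = torch.cat([q_out, k_out, v_out], dim=-1) * scaling
print(lora_output.shape)  # torch.Size([4, 32])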
/sglang/{api.py → lang/api.py}
RENAMED
File without changes

/sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py
RENAMED
File without changes

{sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL
RENAMED
File without changes

{sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE
RENAMED
File without changes

{sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt
RENAMED
File without changes