sglang 0.2.12__py3-none-any.whl → 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +7 -1
- sglang/bench_latency.py +3 -2
- sglang/global_config.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +60 -49
- sglang/lang/interpreter.py +4 -2
- sglang/lang/ir.py +13 -4
- sglang/srt/constrained/jump_forward.py +13 -2
- sglang/srt/layers/activation.py +0 -1
- sglang/srt/layers/extend_attention.py +3 -1
- sglang/srt/layers/fused_moe/__init__.py +1 -0
- sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
- sglang/srt/layers/fused_moe/layer.py +587 -0
- sglang/srt/layers/logits_processor.py +4 -4
- sglang/srt/layers/radix_attention.py +38 -14
- sglang/srt/managers/schedule_batch.py +9 -14
- sglang/srt/managers/tokenizer_manager.py +1 -1
- sglang/srt/managers/tp_worker.py +1 -7
- sglang/srt/model_executor/cuda_graph_runner.py +48 -17
- sglang/srt/model_executor/forward_batch_info.py +132 -58
- sglang/srt/model_executor/model_runner.py +61 -28
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +1 -1
- sglang/srt/models/deepseek.py +2 -2
- sglang/srt/models/deepseek_v2.py +7 -6
- sglang/srt/models/gemma.py +1 -1
- sglang/srt/models/gemma2.py +11 -5
- sglang/srt/models/grok.py +50 -396
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/mixtral.py +56 -254
- sglang/srt/models/mixtral_quant.py +1 -4
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +2 -2
- sglang/srt/models/qwen2_moe.py +2 -2
- sglang/srt/models/stablelm.py +1 -1
- sglang/srt/openai_api/adapter.py +32 -21
- sglang/srt/sampling_params.py +0 -4
- sglang/srt/server.py +23 -15
- sglang/srt/server_args.py +7 -1
- sglang/srt/utils.py +1 -2
- sglang/test/runners.py +18 -10
- sglang/test/test_programs.py +32 -5
- sglang/test/test_utils.py +5 -1
- sglang/version.py +1 -1
- {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/METADATA +12 -4
- {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/RECORD +48 -48
- {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/WHEEL +1 -1
- sglang/srt/model_loader/model_loader.py +0 -292
- sglang/srt/model_loader/utils.py +0 -275
- {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/LICENSE +0 -0
- {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/top_level.txt +0 -0
sglang/srt/models/chatglm.py
CHANGED
@@ -24,8 +24,6 @@ from torch import nn
 from torch.nn import LayerNorm
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
     QKVParallelLinear,
@@ -43,6 +41,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplerOutput
 from vllm.transformers_utils.configs import ChatGLMConfig
 
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
sglang/srt/models/commandr.py
CHANGED
@@ -50,7 +50,6 @@ from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
-from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
     QKVParallelLinear,
@@ -62,6 +61,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmb
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.utils import set_weight_attrs
 
+from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
sglang/srt/models/deepseek.py
CHANGED
@@ -27,9 +27,7 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
-from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
     QKVParallelLinear,
@@ -44,6 +42,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 )
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -26,9 +26,7 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     MergedColumnParallelLinear,
@@ -43,6 +41,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 )
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -445,11 +445,12 @@ class DeepseekV2AttentionMLA(nn.Module):
         q_nope_out = q_input[..., : self.kv_lora_rank]
         torch.bmm(q_nope.transpose(0, 1), self.w_kc, out=q_nope_out.transpose(0, 1))
 
-
-
-        v_input =
-
+        latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
+        v_input = latent_cache[..., : self.kv_lora_rank]
+        v_input = self.kv_a_layernorm(v_input.contiguous()).unsqueeze(1)
+        k_input = latent_cache.unsqueeze(1)
         k_input[..., : self.kv_lora_rank] = v_input
+        k_pe = k_input[..., self.kv_lora_rank :]
 
         q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
         q_input[..., self.kv_lora_rank :] = q_pe
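The replacement lines above rebuild the key/value inputs for the MLA attention path directly from the projected latent cache. Below is a minimal sketch of that tensor bookkeeping; the toy dimensions and the plain `layer_norm` stand-in for `kv_a_layernorm` are assumptions made for illustration, not values taken from the DeepSeek-V2 config.

```python
import torch
import torch.nn.functional as F

# Toy dimensions, chosen only for this sketch.
num_tokens, kv_lora_rank, qk_rope_head_dim = 3, 512, 64

# Stand-in for self.kv_a_proj_with_mqa(hidden_states)[0]: the compressed KV latent
# concatenated with the rotary-position slice along the last dimension.
latent_cache = torch.randn(num_tokens, kv_lora_rank + qk_rope_head_dim)

# The low-rank slice is normalized (plain layer_norm stands in for kv_a_layernorm).
v_input = latent_cache[..., :kv_lora_rank]
v_input = F.layer_norm(v_input.contiguous(), (kv_lora_rank,)).unsqueeze(1)

k_input = latent_cache.unsqueeze(1)    # [num_tokens, 1, kv_lora_rank + rope_dim]
k_input[..., :kv_lora_rank] = v_input  # write the normalized latent back in place
k_pe = k_input[..., kv_lora_rank:]     # rope slice later fed to self.rotary_emb

print(v_input.shape, k_input.shape, k_pe.shape)
# torch.Size([3, 1, 512]) torch.Size([3, 1, 576]) torch.Size([3, 1, 64])
```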
sglang/srt/models/gemma.py
CHANGED
@@ -24,7 +24,6 @@ from transformers import PretrainedConfig
 from vllm.config import CacheConfig, LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import GeluAndMul
-from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
     QKVParallelLinear,
@@ -35,6 +34,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
sglang/srt/models/gemma2.py
CHANGED
@@ -44,6 +44,12 @@ from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
+# Aligned with HF's implementation, using sliding window inclusive with the last token
+# SGLang assumes exclusive
+def get_window_size(config):
+    return config.sliding_window - 1
+
+
 class GemmaRMSNorm(CustomOp):
     """RMS normalization for Gemma.
 
@@ -200,17 +206,14 @@ class Gemma2Attention(nn.Module):
             dtype=torch.get_default_dtype(),
         )
 
-
-        # odd layer, vLLM currently ignores it and uses global attention for
-        # all layers.
-        use_sliding_window = layer_idx % 2 == 1 and config.sliding_window is not None
-        del use_sliding_window  # Unused.
+        use_sliding_window = layer_idx % 2 == 0 and hasattr(config, "sliding_window")
         self.attn = RadixAttention(
             self.num_heads,
             self.head_dim,
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_idx,
+            sliding_window_size=get_window_size(config) if use_sliding_window else None,
             logit_cap=self.config.attn_logit_softcapping,
         )
 
@@ -403,6 +406,9 @@ class Gemma2ForCausalLM(nn.Module):
             input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata
         )
 
+    def get_window_size(self):
+        return get_window_size(self.config)
+
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
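The gemma2.py change wires Gemma 2's interleaved sliding-window attention into RadixAttention and converts HF's inclusive window length to SGLang's exclusive convention via `get_window_size`. Below is a minimal sketch of that conversion and the even-layer pattern from the diff; the `SimpleNamespace` config and the 4096-token window are stand-ins for the real `Gemma2Config`, used here only for illustration.

```python
from types import SimpleNamespace

# Hypothetical stand-in for a Gemma 2 config; the real one comes from transformers.
config = SimpleNamespace(sliding_window=4096)

def get_window_size(config):
    # HF counts the current token inside the window; SGLang expects the number
    # of previous tokens only, hence the -1 (as in the diff above).
    return config.sliding_window - 1

# Interleaved pattern from the diff: even layer indices use the sliding window.
for layer_idx in range(4):
    use_sliding_window = layer_idx % 2 == 0 and hasattr(config, "sliding_window")
    window = get_window_size(config) if use_sliding_window else None
    print(layer_idx, window)  # 0 -> 4095, 1 -> None, 2 -> 4095, 3 -> None
```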