sglang-0.2.12-py3-none-any.whl → sglang-0.2.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. sglang/api.py +7 -1
  2. sglang/bench_latency.py +3 -2
  3. sglang/global_config.py +1 -1
  4. sglang/lang/backend/runtime_endpoint.py +60 -49
  5. sglang/lang/interpreter.py +4 -2
  6. sglang/lang/ir.py +13 -4
  7. sglang/srt/constrained/jump_forward.py +13 -2
  8. sglang/srt/layers/activation.py +0 -1
  9. sglang/srt/layers/extend_attention.py +3 -1
  10. sglang/srt/layers/fused_moe/__init__.py +1 -0
  11. sglang/srt/layers/{fused_moe.py → fused_moe/fused_moe.py} +165 -108
  12. sglang/srt/layers/fused_moe/layer.py +587 -0
  13. sglang/srt/layers/logits_processor.py +4 -4
  14. sglang/srt/layers/radix_attention.py +38 -14
  15. sglang/srt/managers/schedule_batch.py +9 -14
  16. sglang/srt/managers/tokenizer_manager.py +1 -1
  17. sglang/srt/managers/tp_worker.py +1 -7
  18. sglang/srt/model_executor/cuda_graph_runner.py +48 -17
  19. sglang/srt/model_executor/forward_batch_info.py +132 -58
  20. sglang/srt/model_executor/model_runner.py +61 -28
  21. sglang/srt/models/chatglm.py +2 -2
  22. sglang/srt/models/commandr.py +1 -1
  23. sglang/srt/models/deepseek.py +2 -2
  24. sglang/srt/models/deepseek_v2.py +7 -6
  25. sglang/srt/models/gemma.py +1 -1
  26. sglang/srt/models/gemma2.py +11 -5
  27. sglang/srt/models/grok.py +50 -396
  28. sglang/srt/models/minicpm.py +2 -2
  29. sglang/srt/models/mixtral.py +56 -254
  30. sglang/srt/models/mixtral_quant.py +1 -4
  31. sglang/srt/models/qwen.py +2 -2
  32. sglang/srt/models/qwen2.py +2 -2
  33. sglang/srt/models/qwen2_moe.py +2 -2
  34. sglang/srt/models/stablelm.py +1 -1
  35. sglang/srt/openai_api/adapter.py +32 -21
  36. sglang/srt/sampling_params.py +0 -4
  37. sglang/srt/server.py +23 -15
  38. sglang/srt/server_args.py +7 -1
  39. sglang/srt/utils.py +1 -2
  40. sglang/test/runners.py +18 -10
  41. sglang/test/test_programs.py +32 -5
  42. sglang/test/test_utils.py +5 -1
  43. sglang/version.py +1 -1
  44. {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/METADATA +12 -4
  45. {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/RECORD +48 -48
  46. {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/WHEEL +1 -1
  47. sglang/srt/model_loader/model_loader.py +0 -292
  48. sglang/srt/model_loader/utils.py +0 -275
  49. {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/LICENSE +0 -0
  50. {sglang-0.2.12.dist-info → sglang-0.2.13.dist-info}/top_level.txt +0 -0
sglang/srt/models/chatglm.py
@@ -24,8 +24,6 @@ from torch import nn
  from torch.nn import LayerNorm
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.activation import SiluAndMul
- from vllm.model_executor.layers.layernorm import RMSNorm
  from vllm.model_executor.layers.linear import (
      MergedColumnParallelLinear,
      QKVParallelLinear,
@@ -43,6 +41,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
  from vllm.sequence import SamplerOutput
  from vllm.transformers_utils.configs import ChatGLMConfig

+ from sglang.srt.layers.activation import SiluAndMul
+ from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata
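The same relocation repeats in the model files below: SiluAndMul and RMSNorm are now imported from sglang.srt.layers rather than vllm.model_executor.layers, with the call sites left untouched. A hedged usage sketch, not taken from this diff, assuming the sglang layers keep the vllm drop-in interface (SiluAndMul over a concatenated [gate, up] tensor, RMSNorm(hidden_size, eps)) and a CUDA device, since these layers are backed by fused GPU kernels:

```python
import torch

from sglang.srt.layers.activation import SiluAndMul
from sglang.srt.layers.layernorm import RMSNorm

# Shapes and dtype are made up for illustration.
hidden_size, intermediate_size = 64, 128
device, dtype = "cuda", torch.float16

norm = RMSNorm(hidden_size, eps=1e-6).to(device=device, dtype=dtype)
act = SiluAndMul()

x = torch.randn(4, hidden_size, device=device, dtype=dtype)
gate_up = torch.randn(4, 2 * intermediate_size, device=device, dtype=dtype)

normed = norm(x)          # same shape as x
activated = act(gate_up)  # [4, intermediate_size] == silu(gate) * up
```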
sglang/srt/models/commandr.py
@@ -50,7 +50,6 @@ from vllm.distributed import (
      get_tensor_model_parallel_rank,
      get_tensor_model_parallel_world_size,
  )
- from vllm.model_executor.layers.activation import SiluAndMul
  from vllm.model_executor.layers.linear import (
      MergedColumnParallelLinear,
      QKVParallelLinear,
@@ -62,6 +61,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmb
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader
  from vllm.model_executor.utils import set_weight_attrs

+ from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata
sglang/srt/models/deepseek.py
@@ -27,9 +27,7 @@ from vllm.distributed import (
      get_tensor_model_parallel_world_size,
      tensor_model_parallel_all_reduce,
  )
- from vllm.model_executor.layers.activation import SiluAndMul
  from vllm.model_executor.layers.fused_moe import fused_moe
- from vllm.model_executor.layers.layernorm import RMSNorm
  from vllm.model_executor.layers.linear import (
      MergedColumnParallelLinear,
      QKVParallelLinear,
@@ -44,6 +42,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
  )
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

+ from sglang.srt.layers.activation import SiluAndMul
+ from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata
sglang/srt/models/deepseek_v2.py
@@ -26,9 +26,7 @@ from vllm.distributed import (
      get_tensor_model_parallel_world_size,
      tensor_model_parallel_all_reduce,
  )
- from vllm.model_executor.layers.activation import SiluAndMul
  from vllm.model_executor.layers.fused_moe import FusedMoE
- from vllm.model_executor.layers.layernorm import RMSNorm
  from vllm.model_executor.layers.linear import (
      ColumnParallelLinear,
      MergedColumnParallelLinear,
@@ -43,6 +41,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
  )
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

+ from sglang.srt.layers.activation import SiluAndMul
+ from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -445,11 +445,12 @@ class DeepseekV2AttentionMLA(nn.Module):
          q_nope_out = q_input[..., : self.kv_lora_rank]
          torch.bmm(q_nope.transpose(0, 1), self.w_kc, out=q_nope_out.transpose(0, 1))

-         k_input = self.kv_a_proj_with_mqa(hidden_states)[0].unsqueeze(1)
-         k_pe = k_input[..., self.kv_lora_rank :]
-         v_input = k_input[..., : self.kv_lora_rank]
-         v_input = self.kv_a_layernorm(v_input.contiguous())
+         latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
+         v_input = latent_cache[..., : self.kv_lora_rank]
+         v_input = self.kv_a_layernorm(v_input.contiguous()).unsqueeze(1)
+         k_input = latent_cache.unsqueeze(1)
          k_input[..., : self.kv_lora_rank] = v_input
+         k_pe = k_input[..., self.kv_lora_rank :]

          q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
          q_input[..., self.kv_lora_rank :] = q_pe
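The reordered block above computes the MLA latent projection once and reuses it for both K and V: the first kv_lora_rank channels are normalized and written back into the shared buffer, and k_pe is sliced out only afterwards. A standalone sketch of that tensor bookkeeping, with made-up sizes and a plain torch.nn.LayerNorm standing in for kv_a_layernorm purely for illustration:

```python
import torch

# Assumed sizes for illustration; real values come from the DeepSeek-V2 config.
num_tokens, kv_lora_rank, qk_rope_head_dim = 4, 512, 64
kv_a_layernorm = torch.nn.LayerNorm(kv_lora_rank)  # stand-in for the model's norm

with torch.no_grad():
    latent_cache = torch.randn(num_tokens, kv_lora_rank + qk_rope_head_dim)

    # First kv_lora_rank channels: compressed KV, normalized before use.
    v_input = kv_a_layernorm(latent_cache[..., :kv_lora_rank].contiguous()).unsqueeze(1)

    # K reuses the same buffer: normalized part up front, rotary part (k_pe) behind.
    k_input = latent_cache.unsqueeze(1)
    k_input[..., :kv_lora_rank] = v_input
    k_pe = k_input[..., kv_lora_rank:]

assert v_input.shape == (num_tokens, 1, kv_lora_rank)
assert k_pe.shape == (num_tokens, 1, qk_rope_head_dim)
```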
sglang/srt/models/gemma.py
@@ -24,7 +24,6 @@ from transformers import PretrainedConfig
  from vllm.config import CacheConfig, LoRAConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
  from vllm.model_executor.layers.activation import GeluAndMul
- from vllm.model_executor.layers.layernorm import RMSNorm
  from vllm.model_executor.layers.linear import (
      MergedColumnParallelLinear,
      QKVParallelLinear,
@@ -35,6 +34,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

+ from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata
sglang/srt/models/gemma2.py
@@ -44,6 +44,12 @@ from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata


+ # Aligned with HF's implementation, using sliding window inclusive with the last token
+ # SGLang assumes exclusive
+ def get_window_size(config):
+     return config.sliding_window - 1
+
+
  class GemmaRMSNorm(CustomOp):
      """RMS normalization for Gemma.

@@ -200,17 +206,14 @@ class Gemma2Attention(nn.Module):
              dtype=torch.get_default_dtype(),
          )

-         # from vLLM: FIXME(woosuk): While Gemma 2 uses sliding window attention for every
-         # odd layer, vLLM currently ignores it and uses global attention for
-         # all layers.
-         use_sliding_window = layer_idx % 2 == 1 and config.sliding_window is not None
-         del use_sliding_window  # Unused.
+         use_sliding_window = layer_idx % 2 == 0 and hasattr(config, "sliding_window")
          self.attn = RadixAttention(
              self.num_heads,
              self.head_dim,
              self.scaling,
              num_kv_heads=self.num_kv_heads,
              layer_id=layer_idx,
+             sliding_window_size=get_window_size(config) if use_sliding_window else None,
              logit_cap=self.config.attn_logit_softcapping,
          )

@@ -403,6 +406,9 @@ class Gemma2ForCausalLM(nn.Module):
              input_ids, hidden_states, self.model.embed_tokens.weight, input_metadata
          )

+     def get_window_size(self):
+         return get_window_size(self.config)
+
      def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
          stacked_params_mapping = [
              # (param_name, shard_name, shard_id)
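The new get_window_size helper and the per-layer wiring above convert the HF-style sliding window (inclusive of the current token) into the exclusive count that RadixAttention expects, and enable it only on the alternating local-attention layers. A small sketch of that logic in isolation; DummyConfig and the 4096 value are placeholders, not part of the diff:

```python
# Placeholder config; only the sliding_window attribute matters here.
class DummyConfig:
    sliding_window = 4096  # HF convention: window includes the current token


def get_window_size(config):
    # SGLang counts the window exclusive of the current token, hence the -1.
    return config.sliding_window - 1


config = DummyConfig()
for layer_idx in range(4):
    # The diff enables the window on even layers and leaves odd layers global.
    use_sliding_window = layer_idx % 2 == 0 and hasattr(config, "sliding_window")
    window = get_window_size(config) if use_sliding_window else None
    print(layer_idx, window)  # -> 0 4095, 1 None, 2 4095, 3 None
```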