sglang 0.3.1.post1__py3-none-any.whl → 0.3.1.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. sglang/bench_latency.py +11 -2
  2. sglang/bench_server_latency.py +187 -0
  3. sglang/bench_serving.py +1 -1
  4. sglang/srt/layers/activation.py +8 -4
  5. sglang/srt/layers/attention_backend.py +3 -1
  6. sglang/srt/layers/layernorm.py +10 -7
  7. sglang/srt/layers/linear.py +1133 -0
  8. sglang/srt/layers/quantization/__init__.py +76 -0
  9. sglang/srt/layers/quantization/base_config.py +122 -0
  10. sglang/srt/layers/sampler.py +9 -2
  11. sglang/srt/managers/io_struct.py +3 -0
  12. sglang/srt/managers/policy_scheduler.py +49 -93
  13. sglang/srt/managers/schedule_batch.py +1 -1
  14. sglang/srt/managers/tp_worker.py +11 -6
  15. sglang/srt/model_executor/cuda_graph_runner.py +15 -14
  16. sglang/srt/model_executor/model_runner.py +13 -5
  17. sglang/srt/models/baichuan.py +1 -1
  18. sglang/srt/models/chatglm.py +6 -6
  19. sglang/srt/models/commandr.py +7 -7
  20. sglang/srt/models/dbrx.py +7 -7
  21. sglang/srt/models/deepseek.py +7 -7
  22. sglang/srt/models/deepseek_v2.py +9 -9
  23. sglang/srt/models/exaone.py +6 -6
  24. sglang/srt/models/gemma.py +6 -6
  25. sglang/srt/models/gemma2.py +6 -6
  26. sglang/srt/models/gpt_bigcode.py +6 -6
  27. sglang/srt/models/grok.py +6 -6
  28. sglang/srt/models/internlm2.py +6 -6
  29. sglang/srt/models/llama.py +7 -9
  30. sglang/srt/models/llama_classification.py +3 -4
  31. sglang/srt/models/llava.py +1 -1
  32. sglang/srt/models/llavavid.py +1 -1
  33. sglang/srt/models/minicpm.py +6 -6
  34. sglang/srt/models/minicpm3.py +3 -3
  35. sglang/srt/models/mixtral.py +6 -6
  36. sglang/srt/models/mixtral_quant.py +6 -6
  37. sglang/srt/models/olmoe.py +1 -1
  38. sglang/srt/models/qwen.py +6 -6
  39. sglang/srt/models/qwen2.py +6 -6
  40. sglang/srt/models/qwen2_moe.py +7 -7
  41. sglang/srt/models/stablelm.py +6 -6
  42. sglang/srt/models/xverse.py +2 -4
  43. sglang/srt/models/xverse_moe.py +2 -5
  44. sglang/srt/models/yivl.py +1 -1
  45. sglang/srt/server_args.py +17 -21
  46. sglang/srt/utils.py +21 -1
  47. sglang/test/few_shot_gsm8k.py +8 -2
  48. sglang/test/test_utils.py +5 -2
  49. sglang/version.py +1 -1
  50. {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/METADATA +5 -5
  51. {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/RECORD +54 -50
  52. {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/LICENSE +0 -0
  53. {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/WHEEL +0 -0
  54. {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/top_level.txt +0 -0
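Most of the per-model churn below follows a single pattern: the parallel linear layers and the quantization config base class now ship inside sglang itself (the new sglang/srt/layers/linear.py and sglang/srt/layers/quantization/ files listed above), so each model file swaps its vllm imports for sglang ones. A minimal sketch of the before/after import style, assembled from the hunks below rather than copied from any single file:

# Illustrative only; module paths are taken from the hunks below.
# Before (0.3.1.post1): models imported these layers from vllm.
# from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
# from vllm.model_executor.layers.quantization.base_config import QuantizationConfig

# After (0.3.1.post3): the same class names come from sglang's own layers package.
from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear
from sglang.srt.layers.quantization.base_config import QuantizationConfig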
sglang/srt/models/dbrx.py CHANGED
@@ -27,12 +27,6 @@ from vllm.distributed import (
      tensor_model_parallel_all_reduce,
  )
  from vllm.model_executor.layers.fused_moe import fused_moe
- from vllm.model_executor.layers.linear import (
-     QKVParallelLinear,
-     ReplicatedLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      DEFAULT_VOCAB_PADDING_SIZE,
@@ -40,12 +34,18 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
      VocabParallelEmbedding,
  )
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader
- from vllm.model_executor.utils import set_weight_attrs
  from vllm.transformers_utils.configs.dbrx import DbrxConfig

+ from sglang.srt.layers.linear import (
+     QKVParallelLinear,
+     ReplicatedLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata
+ from sglang.srt.utils import set_weight_attrs


  class DbrxRouter(nn.Module):
@@ -28,13 +28,6 @@ from vllm.distributed import (
      tensor_model_parallel_all_reduce,
  )
  from vllm.model_executor.layers.fused_moe import fused_moe
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     ReplicatedLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -44,7 +37,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     ReplicatedLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -27,13 +27,6 @@ from vllm.distributed import (
      tensor_model_parallel_all_reduce,
  )
  from vllm.model_executor.layers.fused_moe import FusedMoE
- from vllm.model_executor.layers.linear import (
-     ColumnParallelLinear,
-     MergedColumnParallelLinear,
-     ReplicatedLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -43,7 +36,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     ColumnParallelLinear,
+     MergedColumnParallelLinear,
+     ReplicatedLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.managers.schedule_batch import global_server_args_dict
  from sglang.srt.model_executor.forward_batch_info import InputMetadata
@@ -507,7 +507,7 @@ class DeepseekV2DecoderLayer(nn.Module):
          rope_theta = getattr(config, "rope_theta", 10000)
          rope_scaling = getattr(config, "rope_scaling", None)
          max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
-         if global_server_args_dict["enable_mla"]:
+         if not global_server_args_dict["disable_mla"]:
              self.self_attn = DeepseekV2AttentionMLA(
                  config=config,
                  hidden_size=self.hidden_size,
@@ -732,7 +732,7 @@ class DeepseekV2ForCausalLM(nn.Module):
                  )
                  weight_loader(param, loaded_weight)

-         if global_server_args_dict["enable_mla"]:
+         if not global_server_args_dict["disable_mla"]:
              for layer_id in range(self.config.num_hidden_layers):
                  self_attn = self.model.layers[layer_id].self_attn
                  w_kc, w_vc = self_attn.kv_b_proj.weight.unflatten(
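The two DeepseekV2 hunks above (and the matching MiniCPM3 hunks further down) also invert the MLA switch: the attention path now checks a disable_mla key in global_server_args_dict instead of enable_mla, so the MLA attention module is built unless it is explicitly turned off. A hedged sketch of the inverted check; the dict literal and variable names here are assumed for illustration, only the key rename comes from the diff:

# Assumed stand-in for sglang's global_server_args_dict; only the key name is from the diff.
global_server_args_dict = {"disable_mla": False}

# Old behavior: MLA was opt-in.
#   if global_server_args_dict["enable_mla"]: ...
# New behavior: MLA is the default and must be opted out of.
if not global_server_args_dict["disable_mla"]:
    attention_impl = "mla"  # hypothetical placeholder for constructing the MLA attention module
else:
    attention_impl = "mha"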
@@ -23,12 +23,6 @@ import torch
  from torch import nn
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -38,7 +32,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -23,19 +23,19 @@ from torch import nn
  from transformers import PretrainedConfig
  from vllm.config import CacheConfig, LoRAConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import GeluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -22,12 +22,6 @@ from torch import nn
  from transformers import PretrainedConfig
  from vllm.config import CacheConfig, LoRAConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig

  # from vllm.model_executor.layers.rotary_embedding import GemmaRotaryEmbedding
  from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
@@ -35,7 +29,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import GeluAndMul
  from sglang.srt.layers.layernorm import GemmaRMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -23,17 +23,17 @@ from torch import nn
  from transformers import GPTBigCodeConfig
  from vllm.config import CacheConfig, LoRAConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     ColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import get_act_fn
+ from sglang.srt.layers.linear import (
+     ColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

sglang/srt/models/grok.py CHANGED
@@ -28,12 +28,6 @@ from vllm.distributed import (
      get_tensor_model_parallel_rank,
      get_tensor_model_parallel_world_size,
  )
- from vllm.model_executor.layers.linear import (
-     QKVParallelLinear,
-     ReplicatedLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -44,7 +38,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.fused_moe import FusedMoE
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     QKVParallelLinear,
+     ReplicatedLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -23,12 +23,6 @@ from torch import nn
  from transformers import PretrainedConfig
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -38,7 +32,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -24,12 +24,6 @@ from torch import nn
  from transformers import LlamaConfig
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -39,7 +33,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.layers.torchao_utils import apply_torchao_config_
  from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -305,8 +305,6 @@ class LlamaForCausalLM(nn.Module):
          self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
          self.logits_processor = LogitsProcessor(config)

-         self.param_dict = dict(self.named_parameters())
-
      @torch.no_grad()
      def forward(
          self,
@@ -374,7 +372,7 @@ class LlamaForCausalLM(nn.Module):
              (".gate_up_proj", ".gate_proj", 0),
              (".gate_up_proj", ".up_proj", 1),
          ]
-         params_dict = self.param_dict
+         params_dict = dict(self.named_parameters())

          for name, loaded_weight in weights:
              if "rotary_emb.inv_freq" in name or "projector" in name:
@@ -19,10 +19,10 @@ import torch
  from torch import nn
  from transformers import LlamaConfig
  from vllm.config import CacheConfig
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.model_executor.forward_batch_info import InputMetadata
  from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel

@@ -36,6 +36,7 @@ class LlamaForClassification(nn.Module):
      ) -> None:
          super().__init__()
          self.config = config
+         self.torchao_config = None
          self.quant_config = quant_config
          self.model = LlamaModel(config, quant_config=quant_config)

@@ -44,8 +45,6 @@ class LlamaForClassification(nn.Module):
          )
          self.eos_token_id = config.eos_token_id

-         self.param_dict = dict(self.named_parameters())
-
      @torch.no_grad()
      def forward(
          self,
@@ -77,7 +76,7 @@ class LlamaForClassification(nn.Module):
          return logits_output

      def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-         params_dict = self.param_dict
+         params_dict = dict(self.named_parameters())

          for name, loaded_weight in weights:
              if "classification_head" in name:
@@ -32,9 +32,9 @@ from transformers import (
  )
  from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
  from vllm.config import CacheConfig
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.mm_utils import (
      get_anyres_image_grid_shape,
      unpad_image,
@@ -23,9 +23,9 @@ from torch import nn
  from transformers import CLIPVisionModel, LlavaConfig
  from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
  from vllm.config import CacheConfig
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
  from sglang.srt.models.llama import LlamaForCausalLM

@@ -22,12 +22,6 @@ import torch
  from torch import nn
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -37,7 +31,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -29,7 +29,6 @@ from vllm.model_executor.layers.linear import (
      ReplicatedLinear,
      RowParallelLinear,
  )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -40,6 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.managers.schedule_batch import global_server_args_dict
  from sglang.srt.model_executor.forward_batch_info import InputMetadata
@@ -419,7 +419,7 @@ class MiniCPM3DecoderLayer(nn.Module):
          rope_theta = getattr(config, "rope_theta", 10000)
          rope_scaling = getattr(config, "rope_scaling", None)
          max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
-         if global_server_args_dict["enable_mla"]:
+         if not global_server_args_dict["disable_mla"]:
              self.self_attn = MiniCPM3AttentionMLA(
                  config=config,
                  hidden_size=self.hidden_size,
@@ -653,7 +653,7 @@ class MiniCPM3ForCausalLM(nn.Module):
                  )
                  weight_loader(param, loaded_weight)

-         if global_server_args_dict["enable_mla"]:
+         if not global_server_args_dict["disable_mla"]:
              for layer_id in range(self.config.num_hidden_layers):
                  self_attn = self.model.layers[layer_id].self_attn
                  w_kc, w_vc = self_attn.kv_b_proj.weight.unflatten(
@@ -24,12 +24,6 @@ from transformers import MixtralConfig
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
  from vllm.model_executor.layers.fused_moe import FusedMoE
- from vllm.model_executor.layers.linear import (
-     QKVParallelLinear,
-     ReplicatedLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      DEFAULT_VOCAB_PADDING_SIZE,
@@ -39,7 +33,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     QKVParallelLinear,
+     ReplicatedLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.layers.torchao_utils import apply_torchao_config_
  from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -29,12 +29,6 @@ from vllm.distributed import (
      get_tensor_model_parallel_world_size,
      tensor_model_parallel_all_reduce,
  )
- from vllm.model_executor.layers.linear import (
-     QKVParallelLinear,
-     ReplicatedLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -43,7 +37,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     QKVParallelLinear,
+     ReplicatedLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -35,7 +35,6 @@ from vllm.model_executor.layers.linear import (
      ReplicatedLinear,
      RowParallelLinear,
  )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -47,6 +46,7 @@ from vllm.utils import print_warning_once
  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
  from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

sglang/srt/models/qwen.py CHANGED
@@ -22,12 +22,6 @@ from torch import nn
  from transformers import PretrainedConfig
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -37,7 +31,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -22,12 +22,6 @@ import torch
  from torch import nn
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -37,8 +31,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.pooler import Pooler, PoolingType
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -29,13 +29,6 @@ from vllm.distributed import (
      tensor_model_parallel_all_reduce,
  )
  from vllm.model_executor.layers.fused_moe import FusedMoE
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     ReplicatedLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -45,7 +38,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
  from sglang.srt.layers.layernorm import RMSNorm
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     ReplicatedLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.layers.torchao_utils import apply_torchao_config_
  from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -24,12 +24,6 @@ from torch import nn
  from transformers import PretrainedConfig
  from vllm.config import CacheConfig
  from vllm.distributed import get_tensor_model_parallel_world_size
- from vllm.model_executor.layers.linear import (
-     MergedColumnParallelLinear,
-     QKVParallelLinear,
-     RowParallelLinear,
- )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -38,7 +32,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.activation import SiluAndMul
+ from sglang.srt.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.forward_batch_info import InputMetadata

@@ -31,7 +31,6 @@ from vllm.model_executor.layers.linear import (
      QKVParallelLinear,
      RowParallelLinear,
  )
- from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
@@ -40,6 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
  from vllm.model_executor.model_loader.weight_utils import default_weight_loader

  from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.model_executor.model_runner import InputMetadata

@@ -307,8 +307,6 @@ class XverseForCausalLM(nn.Module):
          self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
          self.logits_processor = LogitsProcessor(config)

-         self.param_dict = dict(self.named_parameters())
-
      @torch.no_grad()
      def forward(
          self,
@@ -333,7 +331,7 @@ class XverseForCausalLM(nn.Module):
              ("gate_up_proj", "gate_proj", 0),
              ("gate_up_proj", "up_proj", 1),
          ]
-         params_dict = self.param_dict
+         params_dict = dict(self.named_parameters())

          def load_weights_per_param(name, loaded_weight):
              if "rotary_emb.inv_freq" in name or "projector" in name: