sglang 0.3.1.post1__py3-none-any.whl → 0.3.1.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +11 -2
- sglang/bench_server_latency.py +187 -0
- sglang/bench_serving.py +1 -1
- sglang/srt/layers/activation.py +8 -4
- sglang/srt/layers/attention_backend.py +3 -1
- sglang/srt/layers/layernorm.py +10 -7
- sglang/srt/layers/linear.py +1133 -0
- sglang/srt/layers/quantization/__init__.py +76 -0
- sglang/srt/layers/quantization/base_config.py +122 -0
- sglang/srt/layers/sampler.py +9 -2
- sglang/srt/managers/io_struct.py +3 -0
- sglang/srt/managers/policy_scheduler.py +49 -93
- sglang/srt/managers/schedule_batch.py +1 -1
- sglang/srt/managers/tp_worker.py +11 -6
- sglang/srt/model_executor/cuda_graph_runner.py +15 -14
- sglang/srt/model_executor/model_runner.py +13 -5
- sglang/srt/models/baichuan.py +1 -1
- sglang/srt/models/chatglm.py +6 -6
- sglang/srt/models/commandr.py +7 -7
- sglang/srt/models/dbrx.py +7 -7
- sglang/srt/models/deepseek.py +7 -7
- sglang/srt/models/deepseek_v2.py +9 -9
- sglang/srt/models/exaone.py +6 -6
- sglang/srt/models/gemma.py +6 -6
- sglang/srt/models/gemma2.py +6 -6
- sglang/srt/models/gpt_bigcode.py +6 -6
- sglang/srt/models/grok.py +6 -6
- sglang/srt/models/internlm2.py +6 -6
- sglang/srt/models/llama.py +7 -9
- sglang/srt/models/llama_classification.py +3 -4
- sglang/srt/models/llava.py +1 -1
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpm.py +6 -6
- sglang/srt/models/minicpm3.py +3 -3
- sglang/srt/models/mixtral.py +6 -6
- sglang/srt/models/mixtral_quant.py +6 -6
- sglang/srt/models/olmoe.py +1 -1
- sglang/srt/models/qwen.py +6 -6
- sglang/srt/models/qwen2.py +6 -6
- sglang/srt/models/qwen2_moe.py +7 -7
- sglang/srt/models/stablelm.py +6 -6
- sglang/srt/models/xverse.py +2 -4
- sglang/srt/models/xverse_moe.py +2 -5
- sglang/srt/models/yivl.py +1 -1
- sglang/srt/server_args.py +17 -21
- sglang/srt/utils.py +21 -1
- sglang/test/few_shot_gsm8k.py +8 -2
- sglang/test/test_utils.py +5 -2
- sglang/version.py +1 -1
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/METADATA +5 -5
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/RECORD +54 -50
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/LICENSE +0 -0
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/WHEEL +0 -0
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post3.dist-info}/top_level.txt +0 -0
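Most of the per-model changes below follow one recurring pattern: the tensor-parallel linear layers and the QuantizationConfig base class are no longer imported from vllm but from sglang's newly vendored modules (sglang/srt/layers/linear.py and sglang/srt/layers/quantization/base_config.py, both added in this release per the file list above). In dbrx.py, set_weight_attrs similarly moves from vllm.model_executor.utils to sglang.srt.utils. A minimal sketch of the before/after imports, assembled from the diffs below rather than copied from any single file:

    # Before (0.3.1.post1): model files imported the parallel linear layers
    # and QuantizationConfig from vllm.
    # from vllm.model_executor.layers.linear import (
    #     MergedColumnParallelLinear,
    #     QKVParallelLinear,
    #     RowParallelLinear,
    # )
    # from vllm.model_executor.layers.quantization.base_config import QuantizationConfig

    # After (0.3.1.post3): the same names come from sglang's own copies.
    from sglang.srt.layers.linear import (
        MergedColumnParallelLinear,
        QKVParallelLinear,
        RowParallelLinear,
    )
    from sglang.srt.layers.quantization.base_config import QuantizationConfig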
sglang/srt/models/dbrx.py
CHANGED
@@ -27,12 +27,6 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.fused_moe import fused_moe
-from vllm.model_executor.layers.linear import (
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE,
@@ -40,12 +34,18 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding,
 )
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.utils import set_weight_attrs
 from vllm.transformers_utils.configs.dbrx import DbrxConfig
 
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
+from sglang.srt.utils import set_weight_attrs
 
 
 class DbrxRouter(nn.Module):
sglang/srt/models/deepseek.py
CHANGED
@@ -28,13 +28,6 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.fused_moe import fused_moe
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -44,7 +37,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -27,13 +27,6 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.linear import (
-    ColumnParallelLinear,
-    MergedColumnParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -43,7 +36,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
@@ -507,7 +507,7 @@ class DeepseekV2DecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
-        if global_server_args_dict["
+        if not global_server_args_dict["disable_mla"]:
             self.self_attn = DeepseekV2AttentionMLA(
                 config=config,
                 hidden_size=self.hidden_size,
@@ -732,7 +732,7 @@ class DeepseekV2ForCausalLM(nn.Module):
                 )
                 weight_loader(param, loaded_weight)
 
-        if global_server_args_dict["
+        if not global_server_args_dict["disable_mla"]:
             for layer_id in range(self.config.num_hidden_layers):
                 self_attn = self.model.layers[layer_id].self_attn
                 w_kc, w_vc = self_attn.kv_b_proj.weight.unflatten(
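The two hunks above (and the matching ones in minicpm3.py further down) change the MLA gate: the MLA attention path is now taken unless the server sets a disable_mla flag. A small, self-contained sketch of the assumed flag semantics; global_server_args_dict here is a stand-in for the dict that sglang populates from the server arguments at startup:

    # Placeholder for sglang.srt.managers.schedule_batch.global_server_args_dict,
    # which the real server fills in from its command-line arguments.
    global_server_args_dict = {"disable_mla": False}

    def uses_mla_attention() -> bool:
        # Mirrors the gate in DeepseekV2DecoderLayer.__init__ and
        # DeepseekV2ForCausalLM.load_weights: MLA is the default path and is
        # skipped only when disable_mla is True.
        return not global_server_args_dict["disable_mla"]

    print(uses_mla_attention())  # True unless the server disables MLA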
sglang/srt/models/exaone.py
CHANGED
@@ -23,12 +23,6 @@ import torch
 from torch import nn
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -38,7 +32,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/gemma.py
CHANGED
@@ -23,19 +23,19 @@ from torch import nn
 from transformers import PretrainedConfig
 from vllm.config import CacheConfig, LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import GeluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/gemma2.py
CHANGED
@@ -22,12 +22,6 @@ from torch import nn
 from transformers import PretrainedConfig
 from vllm.config import CacheConfig, LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 
 # from vllm.model_executor.layers.rotary_embedding import GemmaRotaryEmbedding
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
@@ -35,7 +29,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import GeluAndMul
 from sglang.srt.layers.layernorm import GemmaRMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/gpt_bigcode.py
CHANGED
@@ -23,17 +23,17 @@ from torch import nn
 from transformers import GPTBigCodeConfig
 from vllm.config import CacheConfig, LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    ColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/grok.py
CHANGED
@@ -28,12 +28,6 @@ from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
-from vllm.model_executor.layers.linear import (
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -44,7 +38,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.fused_moe import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/internlm2.py
CHANGED
@@ -23,12 +23,6 @@ from torch import nn
 from transformers import PretrainedConfig
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -38,7 +32,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/llama.py
CHANGED
@@ -24,12 +24,6 @@ from torch import nn
 from transformers import LlamaConfig
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -39,7 +33,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
 from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -305,8 +305,6 @@ class LlamaForCausalLM(nn.Module):
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
 
-        self.param_dict = dict(self.named_parameters())
-
     @torch.no_grad()
     def forward(
         self,
@@ -374,7 +372,7 @@ class LlamaForCausalLM(nn.Module):
             (".gate_up_proj", ".gate_proj", 0),
             (".gate_up_proj", ".up_proj", 1),
         ]
-        params_dict = self.param_dict
+        params_dict = dict(self.named_parameters())
 
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name or "projector" in name:
sglang/srt/models/llama_classification.py
CHANGED
@@ -19,10 +19,10 @@ import torch
 from torch import nn
 from transformers import LlamaConfig
 from vllm.config import CacheConfig
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel
 
@@ -36,6 +36,7 @@ class LlamaForClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
+        self.torchao_config = None
         self.quant_config = quant_config
         self.model = LlamaModel(config, quant_config=quant_config)
 
@@ -44,8 +45,6 @@ class LlamaForClassification(nn.Module):
         )
         self.eos_token_id = config.eos_token_id
 
-        self.param_dict = dict(self.named_parameters())
-
     @torch.no_grad()
     def forward(
         self,
@@ -77,7 +76,7 @@ class LlamaForClassification(nn.Module):
         return logits_output
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        params_dict = self.param_dict
+        params_dict = dict(self.named_parameters())
 
         for name, loaded_weight in weights:
             if "classification_head" in name:
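llama.py and llama_classification.py above (and xverse.py further down) all drop the cached self.param_dict attribute; load_weights now builds the name-to-parameter mapping when it runs. A hedged, self-contained sketch of that pattern (TinyModel is a hypothetical stand-in, not code from the package):

    import torch
    from torch import nn

    class TinyModel(nn.Module):  # hypothetical stand-in for e.g. LlamaForCausalLM
        def __init__(self):
            super().__init__()
            self.proj = nn.Linear(4, 4)

        def load_weights(self, weights):
            # Built on demand instead of being cached as self.param_dict in __init__.
            params_dict = dict(self.named_parameters())
            for name, loaded_weight in weights:
                param = params_dict[name]
                param.data.copy_(loaded_weight)

    m = TinyModel()
    m.load_weights([("proj.weight", torch.zeros(4, 4)), ("proj.bias", torch.zeros(4))])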
sglang/srt/models/llava.py
CHANGED
@@ -32,9 +32,9 @@ from transformers import (
 )
 from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
 from vllm.config import CacheConfig
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.mm_utils import (
     get_anyres_image_grid_shape,
     unpad_image,
sglang/srt/models/llavavid.py
CHANGED
@@ -23,9 +23,9 @@ from torch import nn
 from transformers import CLIPVisionModel, LlavaConfig
 from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
 from vllm.config import CacheConfig
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
 from sglang.srt.models.llama import LlamaForCausalLM
 
sglang/srt/models/minicpm.py
CHANGED
@@ -22,12 +22,6 @@ import torch
 from torch import nn
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -37,7 +31,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/minicpm3.py
CHANGED
@@ -29,7 +29,6 @@ from vllm.model_executor.layers.linear import (
     ReplicatedLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -40,6 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
@@ -419,7 +419,7 @@ class MiniCPM3DecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
-        if global_server_args_dict["
+        if not global_server_args_dict["disable_mla"]:
             self.self_attn = MiniCPM3AttentionMLA(
                 config=config,
                 hidden_size=self.hidden_size,
@@ -653,7 +653,7 @@ class MiniCPM3ForCausalLM(nn.Module):
                 )
                 weight_loader(param, loaded_weight)
 
-        if global_server_args_dict["
+        if not global_server_args_dict["disable_mla"]:
             for layer_id in range(self.config.num_hidden_layers):
                 self_attn = self.model.layers[layer_id].self_attn
                 w_kc, w_vc = self_attn.kv_b_proj.weight.unflatten(
sglang/srt/models/mixtral.py
CHANGED
@@ -24,12 +24,6 @@ from transformers import MixtralConfig
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.linear import (
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE,
@@ -39,7 +33,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
 from sglang.srt.managers.schedule_batch import global_server_args_dict
sglang/srt/models/mixtral_quant.py
CHANGED
@@ -29,12 +29,6 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from vllm.model_executor.layers.linear import (
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -43,7 +37,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/olmoe.py
CHANGED
@@ -35,7 +35,6 @@ from vllm.model_executor.layers.linear import (
     ReplicatedLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -47,6 +46,7 @@ from vllm.utils import print_warning_once
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/qwen.py
CHANGED
@@ -22,12 +22,6 @@ from torch import nn
 from transformers import PretrainedConfig
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -37,7 +31,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/qwen2.py
CHANGED
@@ -22,12 +22,6 @@ import torch
 from torch import nn
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -37,8 +31,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/qwen2_moe.py
CHANGED
@@ -29,13 +29,6 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -45,7 +38,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
 from sglang.srt.managers.schedule_batch import global_server_args_dict
sglang/srt/models/stablelm.py
CHANGED
@@ -24,12 +24,6 @@ from torch import nn
 from transformers import PretrainedConfig
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -38,7 +32,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/xverse.py
CHANGED
@@ -31,7 +31,6 @@ from vllm.model_executor.layers.linear import (
     QKVParallelLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -40,6 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.model_runner import InputMetadata
 
@@ -307,8 +307,6 @@ class XverseForCausalLM(nn.Module):
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
 
-        self.param_dict = dict(self.named_parameters())
-
     @torch.no_grad()
     def forward(
         self,
@@ -333,7 +331,7 @@ class XverseForCausalLM(nn.Module):
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
         ]
-        params_dict = self.param_dict
+        params_dict = dict(self.named_parameters())
 
         def load_weights_per_param(name, loaded_weight):
             if "rotary_emb.inv_freq" in name or "projector" in name: