sglang 0.3.1.post2__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +12 -11
- sglang/bench_server_latency.py +0 -6
- sglang/srt/hf_transformers_utils.py +1 -0
- sglang/srt/layers/activation.py +3 -2
- sglang/srt/layers/attention_backend.py +6 -12
- sglang/srt/layers/fused_moe/patch.py +117 -0
- sglang/srt/layers/linear.py +1133 -0
- sglang/srt/layers/quantization/__init__.py +76 -0
- sglang/srt/layers/quantization/base_config.py +122 -0
- sglang/srt/managers/schedule_batch.py +3 -5
- sglang/srt/managers/tokenizer_manager.py +1 -0
- sglang/srt/managers/tp_worker.py +1 -1
- sglang/srt/mem_cache/radix_cache.py +5 -5
- sglang/srt/model_executor/cuda_graph_runner.py +10 -6
- sglang/srt/model_executor/forward_batch_info.py +2 -4
- sglang/srt/model_executor/model_runner.py +0 -3
- sglang/srt/models/baichuan.py +1 -1
- sglang/srt/models/chatglm.py +6 -6
- sglang/srt/models/commandr.py +7 -7
- sglang/srt/models/dbrx.py +7 -7
- sglang/srt/models/deepseek.py +7 -7
- sglang/srt/models/deepseek_v2.py +7 -7
- sglang/srt/models/exaone.py +6 -6
- sglang/srt/models/gemma.py +6 -6
- sglang/srt/models/gemma2.py +6 -6
- sglang/srt/models/gpt_bigcode.py +6 -6
- sglang/srt/models/grok.py +6 -6
- sglang/srt/models/internlm2.py +6 -6
- sglang/srt/models/llama.py +14 -6
- sglang/srt/models/llama_classification.py +1 -1
- sglang/srt/models/llava.py +1 -1
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpm.py +6 -6
- sglang/srt/models/minicpm3.py +1 -1
- sglang/srt/models/mixtral.py +6 -6
- sglang/srt/models/mixtral_quant.py +6 -6
- sglang/srt/models/olmoe.py +1 -1
- sglang/srt/models/qwen.py +6 -6
- sglang/srt/models/qwen2.py +6 -6
- sglang/srt/models/qwen2_moe.py +7 -7
- sglang/srt/models/stablelm.py +6 -6
- sglang/srt/models/xverse.py +1 -1
- sglang/srt/models/xverse_moe.py +1 -1
- sglang/srt/models/yivl.py +1 -1
- sglang/srt/openai_api/adapter.py +7 -0
- sglang/srt/utils.py +21 -1
- sglang/test/runners.py +7 -9
- sglang/test/test_utils.py +39 -2
- sglang/version.py +1 -1
- {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/METADATA +8 -6
- {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/RECORD +54 -50
- {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/LICENSE +0 -0
- {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/WHEEL +0 -0
- {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/top_level.txt +0 -0
sglang/srt/models/grok.py
CHANGED
@@ -28,12 +28,6 @@ from vllm.distributed import (
|
|
28
28
|
get_tensor_model_parallel_rank,
|
29
29
|
get_tensor_model_parallel_world_size,
|
30
30
|
)
|
31
|
-
from vllm.model_executor.layers.linear import (
|
32
|
-
QKVParallelLinear,
|
33
|
-
ReplicatedLinear,
|
34
|
-
RowParallelLinear,
|
35
|
-
)
|
36
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
37
31
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
38
32
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
39
33
|
ParallelLMHead,
|
@@ -44,7 +38,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
|
44
38
|
|
45
39
|
from sglang.srt.layers.fused_moe import FusedMoE
|
46
40
|
from sglang.srt.layers.layernorm import RMSNorm
|
41
|
+
from sglang.srt.layers.linear import (
|
42
|
+
QKVParallelLinear,
|
43
|
+
ReplicatedLinear,
|
44
|
+
RowParallelLinear,
|
45
|
+
)
|
47
46
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
47
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
48
48
|
from sglang.srt.layers.radix_attention import RadixAttention
|
49
49
|
from sglang.srt.model_executor.forward_batch_info import InputMetadata
|
50
50
|
|
sglang/srt/models/internlm2.py
CHANGED
@@ -23,12 +23,6 @@ from torch import nn
|
|
23
23
|
from transformers import PretrainedConfig
|
24
24
|
from vllm.config import CacheConfig
|
25
25
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
26
|
-
from vllm.model_executor.layers.linear import (
|
27
|
-
MergedColumnParallelLinear,
|
28
|
-
QKVParallelLinear,
|
29
|
-
RowParallelLinear,
|
30
|
-
)
|
31
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
32
26
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
33
27
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
34
28
|
ParallelLMHead,
|
@@ -38,7 +32,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
|
38
32
|
|
39
33
|
from sglang.srt.layers.activation import SiluAndMul
|
40
34
|
from sglang.srt.layers.layernorm import RMSNorm
|
35
|
+
from sglang.srt.layers.linear import (
|
36
|
+
MergedColumnParallelLinear,
|
37
|
+
QKVParallelLinear,
|
38
|
+
RowParallelLinear,
|
39
|
+
)
|
41
40
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
41
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
42
42
|
from sglang.srt.layers.radix_attention import RadixAttention
|
43
43
|
from sglang.srt.model_executor.forward_batch_info import InputMetadata
|
44
44
|
|
sglang/srt/models/llama.py
CHANGED
@@ -24,12 +24,6 @@ from torch import nn
|
|
24
24
|
from transformers import LlamaConfig
|
25
25
|
from vllm.config import CacheConfig
|
26
26
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
27
|
-
from vllm.model_executor.layers.linear import (
|
28
|
-
MergedColumnParallelLinear,
|
29
|
-
QKVParallelLinear,
|
30
|
-
RowParallelLinear,
|
31
|
-
)
|
32
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
33
27
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
34
28
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
35
29
|
ParallelLMHead,
|
@@ -39,7 +33,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
|
39
33
|
|
40
34
|
from sglang.srt.layers.activation import SiluAndMul
|
41
35
|
from sglang.srt.layers.layernorm import RMSNorm
|
36
|
+
from sglang.srt.layers.linear import (
|
37
|
+
MergedColumnParallelLinear,
|
38
|
+
QKVParallelLinear,
|
39
|
+
RowParallelLinear,
|
40
|
+
)
|
42
41
|
from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
|
42
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
43
43
|
from sglang.srt.layers.radix_attention import RadixAttention
|
44
44
|
from sglang.srt.layers.torchao_utils import apply_torchao_config_
|
45
45
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
@@ -403,6 +403,14 @@ class LlamaForCausalLM(nn.Module):
|
|
403
403
|
weight_loader = getattr(param, "weight_loader", default_weight_loader)
|
404
404
|
weight_loader(param, loaded_weight)
|
405
405
|
|
406
|
+
if (
|
407
|
+
hasattr(self.config, "tie_word_embeddings")
|
408
|
+
and self.config.tie_word_embeddings
|
409
|
+
):
|
410
|
+
# Tie output embedding layer to input embedding layer, to solve issues where lm_head.weight is missing
|
411
|
+
param = self.lm_head.weight
|
412
|
+
weight_loader = getattr(param, "weight_loader", default_weight_loader)
|
413
|
+
weight_loader(param, self.model.embed_tokens.weight)
|
406
414
|
apply_torchao_config_(self, params_dict, set(["proj.weight"]))
|
407
415
|
|
408
416
|
|
@@ -19,10 +19,10 @@ import torch
|
|
19
19
|
from torch import nn
|
20
20
|
from transformers import LlamaConfig
|
21
21
|
from vllm.config import CacheConfig
|
22
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
23
22
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
24
23
|
|
25
24
|
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
|
25
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
26
26
|
from sglang.srt.model_executor.forward_batch_info import InputMetadata
|
27
27
|
from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel
|
28
28
|
|
sglang/srt/models/llava.py
CHANGED
@@ -32,9 +32,9 @@ from transformers import (
|
|
32
32
|
)
|
33
33
|
from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
|
34
34
|
from vllm.config import CacheConfig
|
35
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
36
35
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
37
36
|
|
37
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
38
38
|
from sglang.srt.mm_utils import (
|
39
39
|
get_anyres_image_grid_shape,
|
40
40
|
unpad_image,
|
sglang/srt/models/llavavid.py
CHANGED
@@ -23,9 +23,9 @@ from torch import nn
|
|
23
23
|
from transformers import CLIPVisionModel, LlavaConfig
|
24
24
|
from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
|
25
25
|
from vllm.config import CacheConfig
|
26
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
27
26
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
28
27
|
|
28
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
29
29
|
from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
|
30
30
|
from sglang.srt.models.llama import LlamaForCausalLM
|
31
31
|
|
sglang/srt/models/minicpm.py
CHANGED
@@ -22,12 +22,6 @@ import torch
|
|
22
22
|
from torch import nn
|
23
23
|
from vllm.config import CacheConfig
|
24
24
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
25
|
-
from vllm.model_executor.layers.linear import (
|
26
|
-
MergedColumnParallelLinear,
|
27
|
-
QKVParallelLinear,
|
28
|
-
RowParallelLinear,
|
29
|
-
)
|
30
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
31
25
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
32
26
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
33
27
|
ParallelLMHead,
|
@@ -37,7 +31,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
|
37
31
|
|
38
32
|
from sglang.srt.layers.activation import SiluAndMul
|
39
33
|
from sglang.srt.layers.layernorm import RMSNorm
|
34
|
+
from sglang.srt.layers.linear import (
|
35
|
+
MergedColumnParallelLinear,
|
36
|
+
QKVParallelLinear,
|
37
|
+
RowParallelLinear,
|
38
|
+
)
|
40
39
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
40
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
41
41
|
from sglang.srt.layers.radix_attention import RadixAttention
|
42
42
|
from sglang.srt.model_executor.forward_batch_info import InputMetadata
|
43
43
|
|
sglang/srt/models/minicpm3.py
CHANGED
@@ -29,7 +29,6 @@ from vllm.model_executor.layers.linear import (
|
|
29
29
|
ReplicatedLinear,
|
30
30
|
RowParallelLinear,
|
31
31
|
)
|
32
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
33
32
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
34
33
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
35
34
|
ParallelLMHead,
|
@@ -40,6 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
|
40
39
|
from sglang.srt.layers.activation import SiluAndMul
|
41
40
|
from sglang.srt.layers.layernorm import RMSNorm
|
42
41
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
42
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
43
43
|
from sglang.srt.layers.radix_attention import RadixAttention
|
44
44
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
45
45
|
from sglang.srt.model_executor.forward_batch_info import InputMetadata
|
sglang/srt/models/mixtral.py
CHANGED
@@ -24,12 +24,6 @@ from transformers import MixtralConfig
|
|
24
24
|
from vllm.config import CacheConfig
|
25
25
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
26
26
|
from vllm.model_executor.layers.fused_moe import FusedMoE
|
27
|
-
from vllm.model_executor.layers.linear import (
|
28
|
-
QKVParallelLinear,
|
29
|
-
ReplicatedLinear,
|
30
|
-
RowParallelLinear,
|
31
|
-
)
|
32
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
33
27
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
34
28
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
35
29
|
DEFAULT_VOCAB_PADDING_SIZE,
|
@@ -39,7 +33,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
|
|
39
33
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
40
34
|
|
41
35
|
from sglang.srt.layers.layernorm import RMSNorm
|
36
|
+
from sglang.srt.layers.linear import (
|
37
|
+
QKVParallelLinear,
|
38
|
+
ReplicatedLinear,
|
39
|
+
RowParallelLinear,
|
40
|
+
)
|
42
41
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
42
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
43
43
|
from sglang.srt.layers.radix_attention import RadixAttention
|
44
44
|
from sglang.srt.layers.torchao_utils import apply_torchao_config_
|
45
45
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
@@ -29,12 +29,6 @@ from vllm.distributed import (
|
|
29
29
|
get_tensor_model_parallel_world_size,
|
30
30
|
tensor_model_parallel_all_reduce,
|
31
31
|
)
|
32
|
-
from vllm.model_executor.layers.linear import (
|
33
|
-
QKVParallelLinear,
|
34
|
-
ReplicatedLinear,
|
35
|
-
RowParallelLinear,
|
36
|
-
)
|
37
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
38
32
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
39
33
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
40
34
|
ParallelLMHead,
|
@@ -43,7 +37,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
|
|
43
37
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
44
38
|
|
45
39
|
from sglang.srt.layers.layernorm import RMSNorm
|
40
|
+
from sglang.srt.layers.linear import (
|
41
|
+
QKVParallelLinear,
|
42
|
+
ReplicatedLinear,
|
43
|
+
RowParallelLinear,
|
44
|
+
)
|
46
45
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
46
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
47
47
|
from sglang.srt.layers.radix_attention import RadixAttention
|
48
48
|
from sglang.srt.model_executor.forward_batch_info import InputMetadata
|
49
49
|
|
sglang/srt/models/olmoe.py
CHANGED
@@ -35,7 +35,6 @@ from vllm.model_executor.layers.linear import (
|
|
35
35
|
ReplicatedLinear,
|
36
36
|
RowParallelLinear,
|
37
37
|
)
|
38
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
39
38
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
40
39
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
41
40
|
ParallelLMHead,
|
@@ -47,6 +46,7 @@ from vllm.utils import print_warning_once
|
|
47
46
|
from sglang.srt.layers.activation import SiluAndMul
|
48
47
|
from sglang.srt.layers.layernorm import RMSNorm
|
49
48
|
from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
|
49
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
50
50
|
from sglang.srt.layers.radix_attention import RadixAttention
|
51
51
|
from sglang.srt.model_executor.forward_batch_info import InputMetadata
|
52
52
|
|
sglang/srt/models/qwen.py
CHANGED
@@ -22,12 +22,6 @@ from torch import nn
|
|
22
22
|
from transformers import PretrainedConfig
|
23
23
|
from vllm.config import CacheConfig
|
24
24
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
25
|
-
from vllm.model_executor.layers.linear import (
|
26
|
-
MergedColumnParallelLinear,
|
27
|
-
QKVParallelLinear,
|
28
|
-
RowParallelLinear,
|
29
|
-
)
|
30
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
31
25
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
32
26
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
33
27
|
ParallelLMHead,
|
@@ -37,7 +31,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
|
37
31
|
|
38
32
|
from sglang.srt.layers.activation import SiluAndMul
|
39
33
|
from sglang.srt.layers.layernorm import RMSNorm
|
34
|
+
from sglang.srt.layers.linear import (
|
35
|
+
MergedColumnParallelLinear,
|
36
|
+
QKVParallelLinear,
|
37
|
+
RowParallelLinear,
|
38
|
+
)
|
40
39
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
40
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
41
41
|
from sglang.srt.layers.radix_attention import RadixAttention
|
42
42
|
from sglang.srt.model_executor.forward_batch_info import InputMetadata
|
43
43
|
|
sglang/srt/models/qwen2.py
CHANGED
@@ -22,12 +22,6 @@ import torch
|
|
22
22
|
from torch import nn
|
23
23
|
from vllm.config import CacheConfig
|
24
24
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
25
|
-
from vllm.model_executor.layers.linear import (
|
26
|
-
MergedColumnParallelLinear,
|
27
|
-
QKVParallelLinear,
|
28
|
-
RowParallelLinear,
|
29
|
-
)
|
30
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
31
25
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
32
26
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
33
27
|
ParallelLMHead,
|
@@ -37,8 +31,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
|
37
31
|
|
38
32
|
from sglang.srt.layers.activation import SiluAndMul
|
39
33
|
from sglang.srt.layers.layernorm import RMSNorm
|
34
|
+
from sglang.srt.layers.linear import (
|
35
|
+
MergedColumnParallelLinear,
|
36
|
+
QKVParallelLinear,
|
37
|
+
RowParallelLinear,
|
38
|
+
)
|
40
39
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
41
40
|
from sglang.srt.layers.pooler import Pooler, PoolingType
|
41
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
42
42
|
from sglang.srt.layers.radix_attention import RadixAttention
|
43
43
|
from sglang.srt.model_executor.forward_batch_info import InputMetadata
|
44
44
|
|
sglang/srt/models/qwen2_moe.py
CHANGED
@@ -29,13 +29,6 @@ from vllm.distributed import (
|
|
29
29
|
tensor_model_parallel_all_reduce,
|
30
30
|
)
|
31
31
|
from vllm.model_executor.layers.fused_moe import FusedMoE
|
32
|
-
from vllm.model_executor.layers.linear import (
|
33
|
-
MergedColumnParallelLinear,
|
34
|
-
QKVParallelLinear,
|
35
|
-
ReplicatedLinear,
|
36
|
-
RowParallelLinear,
|
37
|
-
)
|
38
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
39
32
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
40
33
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
41
34
|
ParallelLMHead,
|
@@ -45,7 +38,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
|
45
38
|
|
46
39
|
from sglang.srt.layers.activation import SiluAndMul
|
47
40
|
from sglang.srt.layers.layernorm import RMSNorm
|
41
|
+
from sglang.srt.layers.linear import (
|
42
|
+
MergedColumnParallelLinear,
|
43
|
+
QKVParallelLinear,
|
44
|
+
ReplicatedLinear,
|
45
|
+
RowParallelLinear,
|
46
|
+
)
|
48
47
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
48
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
49
49
|
from sglang.srt.layers.radix_attention import RadixAttention
|
50
50
|
from sglang.srt.layers.torchao_utils import apply_torchao_config_
|
51
51
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
sglang/srt/models/stablelm.py
CHANGED
@@ -24,12 +24,6 @@ from torch import nn
|
|
24
24
|
from transformers import PretrainedConfig
|
25
25
|
from vllm.config import CacheConfig
|
26
26
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
27
|
-
from vllm.model_executor.layers.linear import (
|
28
|
-
MergedColumnParallelLinear,
|
29
|
-
QKVParallelLinear,
|
30
|
-
RowParallelLinear,
|
31
|
-
)
|
32
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
33
27
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
34
28
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
35
29
|
ParallelLMHead,
|
@@ -38,7 +32,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
|
|
38
32
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
39
33
|
|
40
34
|
from sglang.srt.layers.activation import SiluAndMul
|
35
|
+
from sglang.srt.layers.linear import (
|
36
|
+
MergedColumnParallelLinear,
|
37
|
+
QKVParallelLinear,
|
38
|
+
RowParallelLinear,
|
39
|
+
)
|
41
40
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
41
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
42
42
|
from sglang.srt.layers.radix_attention import RadixAttention
|
43
43
|
from sglang.srt.model_executor.forward_batch_info import InputMetadata
|
44
44
|
|
sglang/srt/models/xverse.py
CHANGED
@@ -31,7 +31,6 @@ from vllm.model_executor.layers.linear import (
|
|
31
31
|
QKVParallelLinear,
|
32
32
|
RowParallelLinear,
|
33
33
|
)
|
34
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
35
34
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
36
35
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
37
36
|
ParallelLMHead,
|
@@ -40,6 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
|
|
40
39
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
41
40
|
|
42
41
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
42
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
43
43
|
from sglang.srt.layers.radix_attention import RadixAttention
|
44
44
|
from sglang.srt.model_executor.model_runner import InputMetadata
|
45
45
|
|
sglang/srt/models/xverse_moe.py
CHANGED
@@ -34,7 +34,6 @@ from vllm.model_executor.layers.linear import (
|
|
34
34
|
ReplicatedLinear,
|
35
35
|
RowParallelLinear,
|
36
36
|
)
|
37
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
38
37
|
from vllm.model_executor.layers.rotary_embedding import get_rope
|
39
38
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
40
39
|
ParallelLMHead,
|
@@ -43,6 +42,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
|
|
43
42
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
44
43
|
|
45
44
|
from sglang.srt.layers.logits_processor import LogitsProcessor
|
45
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
46
46
|
from sglang.srt.layers.radix_attention import RadixAttention
|
47
47
|
from sglang.srt.model_executor.forward_batch_info import InputMetadata
|
48
48
|
|
sglang/srt/models/yivl.py
CHANGED
@@ -21,9 +21,9 @@ import torch
|
|
21
21
|
import torch.nn as nn
|
22
22
|
from transformers import CLIPVisionModel, LlavaConfig
|
23
23
|
from vllm.config import CacheConfig
|
24
|
-
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
|
25
24
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
26
25
|
|
26
|
+
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
27
27
|
from sglang.srt.models.llava import LlavaLlamaForCausalLM
|
28
28
|
|
29
29
|
|
sglang/srt/openai_api/adapter.py
CHANGED
@@ -858,11 +858,18 @@ def v1_chat_generate_request(
|
|
858
858
|
openai_compatible_messages.append(
|
859
859
|
{"role": message.role, "content": content["text"]}
|
860
860
|
)
|
861
|
+
if openai_compatible_messages[-1]["role"] == "assistant":
|
862
|
+
assistant_prefix = openai_compatible_messages[-1]["content"]
|
863
|
+
openai_compatible_messages = openai_compatible_messages[:-1]
|
864
|
+
else:
|
865
|
+
assistant_prefix = None
|
861
866
|
prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
|
862
867
|
openai_compatible_messages,
|
863
868
|
tokenize=True,
|
864
869
|
add_generation_prompt=True,
|
865
870
|
)
|
871
|
+
if assistant_prefix:
|
872
|
+
prompt_ids += tokenizer_manager.tokenizer.encode(assistant_prefix)
|
866
873
|
stop = request.stop
|
867
874
|
image_data = None
|
868
875
|
modalities = []
|
sglang/srt/utils.py
CHANGED
@@ -26,7 +26,7 @@ import struct
|
|
26
26
|
import time
|
27
27
|
from importlib.metadata import PackageNotFoundError, version
|
28
28
|
from io import BytesIO
|
29
|
-
from typing import List, Optional, Union
|
29
|
+
from typing import Any, Dict, List, Optional, Union
|
30
30
|
|
31
31
|
import numpy as np
|
32
32
|
import psutil
|
@@ -682,3 +682,23 @@ def replace_submodule(
|
|
682
682
|
target_name = module_name.split(".")[-1]
|
683
683
|
setattr(parent, target_name, new_module)
|
684
684
|
return new_module
|
685
|
+
|
686
|
+
|
687
|
+
def set_weight_attrs(
|
688
|
+
weight: torch.Tensor,
|
689
|
+
weight_attrs: Optional[Dict[str, Any]],
|
690
|
+
):
|
691
|
+
"""Set attributes on a weight tensor.
|
692
|
+
|
693
|
+
This method is used to set attributes on a weight tensor. This method
|
694
|
+
will not overwrite existing attributes.
|
695
|
+
|
696
|
+
Args:
|
697
|
+
weight: The weight tensor.
|
698
|
+
weight_attrs: A dictionary of attributes to set on the weight tensor.
|
699
|
+
"""
|
700
|
+
if weight_attrs is None:
|
701
|
+
return
|
702
|
+
for key, value in weight_attrs.items():
|
703
|
+
assert not hasattr(weight, key), f"Overwriting existing tensor attribute: {key}"
|
704
|
+
setattr(weight, key, value)
|
sglang/test/runners.py
CHANGED
@@ -21,19 +21,19 @@ from typing import List, Union
|
|
21
21
|
|
22
22
|
import torch
|
23
23
|
import torch.nn.functional as F
|
24
|
-
from peft import PeftModel
|
25
|
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
24
|
+
from transformers import AutoModelForCausalLM
|
26
25
|
|
26
|
+
from sglang.srt.hf_transformers_utils import get_tokenizer
|
27
27
|
from sglang.srt.server import Runtime
|
28
28
|
from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER
|
29
29
|
|
30
30
|
DEFAULT_PROMPTS = [
|
31
|
-
# the output of gemma-2-2b from SRT is unstable on the commented prompt
|
32
|
-
# "The capital of France is",
|
33
31
|
"Apple is red. Banana is Yellow. " * 800 + "Apple is",
|
34
32
|
"The capital of the United Kingdom is",
|
35
33
|
"Today is a sunny day and I like",
|
36
34
|
"AI is a field of computer science focused on",
|
35
|
+
# the output of gemma-2-2b from SRT is unstable on the commented prompt
|
36
|
+
# "The capital of France is",
|
37
37
|
]
|
38
38
|
|
39
39
|
dirpath = os.path.dirname(__file__)
|
@@ -93,11 +93,7 @@ class HFRunner:
|
|
93
93
|
self.model_proc.start()
|
94
94
|
|
95
95
|
def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
|
96
|
-
self.tokenizer = AutoTokenizer.from_pretrained(
|
97
|
-
model_path,
|
98
|
-
torch_dtype=torch_dtype,
|
99
|
-
)
|
100
|
-
|
96
|
+
self.tokenizer = get_tokenizer(model_path)
|
101
97
|
if self.is_generation:
|
102
98
|
self.base_model = AutoModelForCausalLM.from_pretrained(
|
103
99
|
model_path,
|
@@ -132,6 +128,8 @@ class HFRunner:
|
|
132
128
|
input_ids = torch.tensor([p], device="cuda")
|
133
129
|
|
134
130
|
if lora_paths is not None and lora_paths[i] is not None:
|
131
|
+
from peft import PeftModel
|
132
|
+
|
135
133
|
self.model = PeftModel.from_pretrained(
|
136
134
|
self.base_model,
|
137
135
|
lora_paths[i],
|
sglang/test/test_utils.py
CHANGED
@@ -25,11 +25,14 @@ from sglang.utils import get_exception_traceback
|
|
25
25
|
DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
|
26
26
|
DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
27
27
|
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
28
|
+
DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
|
29
|
+
DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
|
28
30
|
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
|
29
31
|
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
|
30
|
-
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
|
32
|
+
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
|
31
33
|
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
|
32
|
-
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
|
34
|
+
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
|
35
|
+
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
|
33
36
|
|
34
37
|
|
35
38
|
def is_in_ci():
|
@@ -585,3 +588,37 @@ def run_bench_latency(model, other_args):
|
|
585
588
|
kill_child_process(process.pid)
|
586
589
|
|
587
590
|
return output_throughput
|
591
|
+
|
592
|
+
|
593
|
+
def lcs(X, Y):
|
594
|
+
m = len(X)
|
595
|
+
n = len(Y)
|
596
|
+
L = [[0] * (n + 1) for _ in range(m + 1)]
|
597
|
+
|
598
|
+
for i in range(m + 1):
|
599
|
+
for j in range(n + 1):
|
600
|
+
if i == 0 or j == 0:
|
601
|
+
L[i][j] = 0
|
602
|
+
elif X[i - 1] == Y[j - 1]:
|
603
|
+
L[i][j] = L[i - 1][j - 1] + 1
|
604
|
+
else:
|
605
|
+
L[i][j] = max(L[i - 1][j], L[i][j - 1])
|
606
|
+
|
607
|
+
return L[m][n]
|
608
|
+
|
609
|
+
|
610
|
+
def calculate_rouge_l(output_strs_list1, output_strs_list2):
|
611
|
+
"""calculate the ROUGE-L score"""
|
612
|
+
rouge_l_scores = []
|
613
|
+
|
614
|
+
for s1, s2 in zip(output_strs_list1, output_strs_list2):
|
615
|
+
lcs_len = lcs(s1, s2)
|
616
|
+
precision = lcs_len / len(s1) if len(s1) > 0 else 0
|
617
|
+
recall = lcs_len / len(s2) if len(s2) > 0 else 0
|
618
|
+
if precision + recall > 0:
|
619
|
+
fmeasure = (2 * precision * recall) / (precision + recall)
|
620
|
+
else:
|
621
|
+
fmeasure = 0.0
|
622
|
+
rouge_l_scores.append(fmeasure)
|
623
|
+
|
624
|
+
return rouge_l_scores
|
sglang/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.3.1.post2"
|
1
|
+
__version__ = "0.3.2"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.3.1.post2
|
3
|
+
Version: 0.3.2
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
|
318
318
|
### Method 2: From source
|
319
319
|
```
|
320
320
|
# Use the last release branch
|
321
|
-
git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
|
321
|
+
git clone -b v0.3.2 https://github.com/sgl-project/sglang.git
|
322
322
|
cd sglang
|
323
323
|
|
324
324
|
pip install --upgrade pip
|
@@ -348,9 +348,9 @@ docker run --gpus all \
|
|
348
348
|
<summary>More</summary>
|
349
349
|
|
350
350
|
> This method is recommended if you plan to serve it as a service.
|
351
|
-
> A better approach is to use the [k8s-sglang-service.yaml](
|
351
|
+
> A better approach is to use the [k8s-sglang-service.yaml](docker/k8s-sglang-service.yaml).
|
352
352
|
|
353
|
-
1. Copy the [compose.yml](
|
353
|
+
1. Copy the [compose.yml](docker/compose.yaml) to your local machine
|
354
354
|
2. Execute the command `docker compose up -d` in your terminal.
|
355
355
|
</details>
|
356
356
|
|
@@ -499,6 +499,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
499
499
|
- Llama / Llama 2 / Llama 3 / Llama 3.1
|
500
500
|
- Mistral / Mixtral / Mistral NeMo
|
501
501
|
- Gemma / Gemma 2
|
502
|
+
- OLMoE
|
502
503
|
- Qwen / Qwen 2 / Qwen 2 MoE
|
503
504
|
- DeepSeek / DeepSeek 2
|
504
505
|
- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
|
@@ -520,6 +521,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
520
521
|
- BaiChuan2
|
521
522
|
- MiniCPM / MiniCPM 3
|
522
523
|
- XVERSE / XVERSE MoE
|
524
|
+
- SmolLM
|
523
525
|
|
524
526
|
|
525
527
|
**Embedding Models**
|
@@ -528,7 +530,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
528
530
|
- gte-Qwen2
|
529
531
|
- `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
|
530
532
|
|
531
|
-
Instructions for supporting a new model are [here](
|
533
|
+
Instructions for supporting a new model are [here](docs/en/model_support.md).
|
532
534
|
|
533
535
|
#### Use Models From ModelScope
|
534
536
|
<details>
|
@@ -823,7 +825,7 @@ def chat_example(s):
|
|
823
825
|
Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
|
824
826
|
|
825
827
|
## Roadmap
|
826
|
-
[Development Roadmap (2024 Q3)](https://github.com/sgl-project/sglang/issues/634)
|
828
|
+
[Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
|
827
829
|
|
828
830
|
## Citation And Acknowledgment
|
829
831
|
Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
|