sglang 0.3.1.post2__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. sglang/bench_latency.py +12 -11
  2. sglang/bench_server_latency.py +0 -6
  3. sglang/srt/hf_transformers_utils.py +1 -0
  4. sglang/srt/layers/activation.py +3 -2
  5. sglang/srt/layers/attention_backend.py +6 -12
  6. sglang/srt/layers/fused_moe/patch.py +117 -0
  7. sglang/srt/layers/linear.py +1133 -0
  8. sglang/srt/layers/quantization/__init__.py +76 -0
  9. sglang/srt/layers/quantization/base_config.py +122 -0
  10. sglang/srt/managers/schedule_batch.py +3 -5
  11. sglang/srt/managers/tokenizer_manager.py +1 -0
  12. sglang/srt/managers/tp_worker.py +1 -1
  13. sglang/srt/mem_cache/radix_cache.py +5 -5
  14. sglang/srt/model_executor/cuda_graph_runner.py +10 -6
  15. sglang/srt/model_executor/forward_batch_info.py +2 -4
  16. sglang/srt/model_executor/model_runner.py +0 -3
  17. sglang/srt/models/baichuan.py +1 -1
  18. sglang/srt/models/chatglm.py +6 -6
  19. sglang/srt/models/commandr.py +7 -7
  20. sglang/srt/models/dbrx.py +7 -7
  21. sglang/srt/models/deepseek.py +7 -7
  22. sglang/srt/models/deepseek_v2.py +7 -7
  23. sglang/srt/models/exaone.py +6 -6
  24. sglang/srt/models/gemma.py +6 -6
  25. sglang/srt/models/gemma2.py +6 -6
  26. sglang/srt/models/gpt_bigcode.py +6 -6
  27. sglang/srt/models/grok.py +6 -6
  28. sglang/srt/models/internlm2.py +6 -6
  29. sglang/srt/models/llama.py +14 -6
  30. sglang/srt/models/llama_classification.py +1 -1
  31. sglang/srt/models/llava.py +1 -1
  32. sglang/srt/models/llavavid.py +1 -1
  33. sglang/srt/models/minicpm.py +6 -6
  34. sglang/srt/models/minicpm3.py +1 -1
  35. sglang/srt/models/mixtral.py +6 -6
  36. sglang/srt/models/mixtral_quant.py +6 -6
  37. sglang/srt/models/olmoe.py +1 -1
  38. sglang/srt/models/qwen.py +6 -6
  39. sglang/srt/models/qwen2.py +6 -6
  40. sglang/srt/models/qwen2_moe.py +7 -7
  41. sglang/srt/models/stablelm.py +6 -6
  42. sglang/srt/models/xverse.py +1 -1
  43. sglang/srt/models/xverse_moe.py +1 -1
  44. sglang/srt/models/yivl.py +1 -1
  45. sglang/srt/openai_api/adapter.py +7 -0
  46. sglang/srt/utils.py +21 -1
  47. sglang/test/runners.py +7 -9
  48. sglang/test/test_utils.py +39 -2
  49. sglang/version.py +1 -1
  50. {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/METADATA +8 -6
  51. {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/RECORD +54 -50
  52. {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/LICENSE +0 -0
  53. {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/WHEEL +0 -0
  54. {sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/top_level.txt +0 -0
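
Note on the recurring import change: most of the per-model hunks below apply one mechanical migration. sglang 0.3.2 vendors vLLM's parallel linear layers and quantization config base into its own tree (the new sglang/srt/layers/linear.py and sglang/srt/layers/quantization/ files listed above), and each model file swaps its imports from the vllm paths to the sglang ones. A minimal sketch of the before/after:

```
# Before (0.3.1.post2): model files imported these from vLLM.
# from vllm.model_executor.layers.linear import QKVParallelLinear
# from vllm.model_executor.layers.quantization.base_config import QuantizationConfig

# After (0.3.2): the same classes come from sglang's vendored copies.
from sglang.srt.layers.linear import QKVParallelLinear
from sglang.srt.layers.quantization.base_config import QuantizationConfig
```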
sglang/srt/models/grok.py CHANGED
@@ -28,12 +28,6 @@ from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
-from vllm.model_executor.layers.linear import (
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -44,7 +38,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.fused_moe import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/internlm2.py CHANGED
@@ -23,12 +23,6 @@ from torch import nn
 from transformers import PretrainedConfig
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -38,7 +32,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/llama.py CHANGED
@@ -24,12 +24,6 @@ from torch import nn
 from transformers import LlamaConfig
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -39,7 +33,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
 from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -403,6 +403,14 @@ class LlamaForCausalLM(nn.Module):
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
 
+        if (
+            hasattr(self.config, "tie_word_embeddings")
+            and self.config.tie_word_embeddings
+        ):
+            # Tie output embedding layer to input embedding layer, to solve issues where lm_head.weight is missing
+            param = self.lm_head.weight
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, self.model.embed_tokens.weight)
         apply_torchao_config_(self, params_dict, set(["proj.weight"]))
 
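
The hunk above makes load_weights fall back to the input embedding when a checkpoint sets tie_word_embeddings but ships no lm_head.weight. A minimal sketch of the same tying pattern, using a hypothetical toy module (the attribute names mirror the hunk; this is not SGLang's exact code path):

```
import torch
from torch import nn

class TinyLM(nn.Module):
    def __init__(self, vocab_size=128, hidden=32, tie_word_embeddings=True):
        super().__init__()
        self.tie_word_embeddings = tie_word_embeddings
        self.embed_tokens = nn.Embedding(vocab_size, hidden)
        self.lm_head = nn.Linear(hidden, vocab_size, bias=False)

    def load_weights(self, state_dict):
        missing = self.load_state_dict(state_dict, strict=False).missing_keys
        # Checkpoint omitted lm_head.weight: reuse the input embedding weight.
        if self.tie_word_embeddings and "lm_head.weight" in missing:
            self.lm_head.weight = self.embed_tokens.weight

model = TinyLM()
model.load_weights({"embed_tokens.weight": torch.randn(128, 32)})
# Both layers now share one tensor.
assert model.lm_head.weight.data_ptr() == model.embed_tokens.weight.data_ptr()
```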
 
sglang/srt/models/llama_classification.py CHANGED
@@ -19,10 +19,10 @@ import torch
 from torch import nn
 from transformers import LlamaConfig
 from vllm.config import CacheConfig
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel
 
sglang/srt/models/llava.py CHANGED
@@ -32,9 +32,9 @@ from transformers import (
 )
 from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
 from vllm.config import CacheConfig
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.mm_utils import (
     get_anyres_image_grid_shape,
     unpad_image,
sglang/srt/models/llavavid.py CHANGED
@@ -23,9 +23,9 @@ from torch import nn
 from transformers import CLIPVisionModel, LlavaConfig
 from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
 from vllm.config import CacheConfig
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
 from sglang.srt.models.llama import LlamaForCausalLM
 
sglang/srt/models/minicpm.py CHANGED
@@ -22,12 +22,6 @@ import torch
 from torch import nn
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -37,7 +31,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/minicpm3.py CHANGED
@@ -29,7 +29,6 @@ from vllm.model_executor.layers.linear import (
     ReplicatedLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -40,6 +39,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
sglang/srt/models/mixtral.py CHANGED
@@ -24,12 +24,6 @@ from transformers import MixtralConfig
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.linear import (
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE,
@@ -39,7 +33,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
 from sglang.srt.managers.schedule_batch import global_server_args_dict
sglang/srt/models/mixtral_quant.py CHANGED
@@ -29,12 +29,6 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from vllm.model_executor.layers.linear import (
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -43,7 +37,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/olmoe.py CHANGED
@@ -35,7 +35,6 @@ from vllm.model_executor.layers.linear import (
     ReplicatedLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -47,6 +46,7 @@ from vllm.utils import print_warning_once
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/qwen.py CHANGED
@@ -22,12 +22,6 @@ from torch import nn
 from transformers import PretrainedConfig
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -37,7 +31,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/qwen2.py CHANGED
@@ -22,12 +22,6 @@ import torch
 from torch import nn
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -37,8 +31,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/qwen2_moe.py CHANGED
@@ -29,13 +29,6 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -45,7 +38,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.torchao_utils import apply_torchao_config_
 from sglang.srt.managers.schedule_batch import global_server_args_dict
sglang/srt/models/stablelm.py CHANGED
@@ -24,12 +24,6 @@ from torch import nn
 from transformers import PretrainedConfig
 from vllm.config import CacheConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -38,7 +32,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/xverse.py CHANGED
@@ -31,7 +31,6 @@ from vllm.model_executor.layers.linear import (
     QKVParallelLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -40,6 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.model_runner import InputMetadata
 
sglang/srt/models/xverse_moe.py CHANGED
@@ -34,7 +34,6 @@ from vllm.model_executor.layers.linear import (
     ReplicatedLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
@@ -43,6 +42,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
sglang/srt/models/yivl.py CHANGED
@@ -21,9 +21,9 @@ import torch
 import torch.nn as nn
 from transformers import CLIPVisionModel, LlavaConfig
 from vllm.config import CacheConfig
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.models.llava import LlavaLlamaForCausalLM
 
 
sglang/srt/openai_api/adapter.py CHANGED
@@ -858,11 +858,18 @@
                 openai_compatible_messages.append(
                     {"role": message.role, "content": content["text"]}
                 )
+        if openai_compatible_messages[-1]["role"] == "assistant":
+            assistant_prefix = openai_compatible_messages[-1]["content"]
+            openai_compatible_messages = openai_compatible_messages[:-1]
+        else:
+            assistant_prefix = None
         prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
             openai_compatible_messages,
             tokenize=True,
             add_generation_prompt=True,
         )
+        if assistant_prefix:
+            prompt_ids += tokenizer_manager.tokenizer.encode(assistant_prefix)
         stop = request.stop
         image_data = None
         modalities = []
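
This adapter change enables assistant-prefix continuation in the chat endpoint: when the final message is from the assistant, it is popped before apply_chat_template and its text is re-encoded and appended after the generation prompt, so the model continues that partial reply instead of treating it as a finished turn. A hedged usage sketch, assuming a local SGLang server on the default port:

```
import requests

resp = requests.post(
    "http://localhost:30000/v1/chat/completions",  # assumed local server
    json={
        "model": "default",
        "messages": [
            {"role": "user", "content": "Name three primary colors."},
            # Trailing assistant message becomes the generation prefix.
            {"role": "assistant", "content": "The three primary colors are"},
        ],
        "max_tokens": 32,
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```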
sglang/srt/utils.py CHANGED
@@ -26,7 +26,7 @@ import struct
 import time
 from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
-from typing import List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
 import psutil
@@ -682,3 +682,23 @@ def replace_submodule(
     target_name = module_name.split(".")[-1]
     setattr(parent, target_name, new_module)
     return new_module
+
+
+def set_weight_attrs(
+    weight: torch.Tensor,
+    weight_attrs: Optional[Dict[str, Any]],
+):
+    """Set attributes on a weight tensor.
+
+    This method is used to set attributes on a weight tensor. This method
+    will not overwrite existing attributes.
+
+    Args:
+        weight: The weight tensor.
+        weight_attrs: A dictionary of attributes to set on the weight tensor.
+    """
+    if weight_attrs is None:
+        return
+    for key, value in weight_attrs.items():
+        assert not hasattr(weight, key), f"Overwriting existing tensor attribute: {key}"
+        setattr(weight, key, value)
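
set_weight_attrs mirrors the vLLM helper of the same name; the vendored linear and quantization layers use it to attach loading metadata, such as a per-parameter weight_loader, directly onto tensors. A small usage sketch (my_loader is a hypothetical stand-in, not an SGLang API):

```
import torch
from sglang.srt.utils import set_weight_attrs

weight = torch.nn.Parameter(torch.empty(16, 16), requires_grad=False)

def my_loader(param, loaded_weight):
    param.data.copy_(loaded_weight)

set_weight_attrs(weight, {"weight_loader": my_loader, "input_dim": 1})
weight.weight_loader(weight, torch.zeros(16, 16))  # attribute is now callable
# Setting the same key again would trip the assert guarding overwrites.
```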
sglang/test/runners.py CHANGED
@@ -21,19 +21,19 @@ from typing import List, Union
 
 import torch
 import torch.nn.functional as F
-from peft import PeftModel
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM
 
+from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.server import Runtime
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER
 
 DEFAULT_PROMPTS = [
-    # the output of gemma-2-2b from SRT is unstable on the commented prompt
-    # "The capital of France is",
     "Apple is red. Banana is Yellow. " * 800 + "Apple is",
     "The capital of the United Kingdom is",
     "Today is a sunny day and I like",
     "AI is a field of computer science focused on",
+    # the output of gemma-2-2b from SRT is unstable on the commented prompt
+    # "The capital of France is",
 ]
 
 dirpath = os.path.dirname(__file__)
@@ -93,11 +93,7 @@ class HFRunner:
         self.model_proc.start()
 
     def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            model_path,
-            torch_dtype=torch_dtype,
-        )
-
+        self.tokenizer = get_tokenizer(model_path)
         if self.is_generation:
             self.base_model = AutoModelForCausalLM.from_pretrained(
                 model_path,
@@ -132,6 +128,8 @@ class HFRunner:
             input_ids = torch.tensor([p], device="cuda")
 
             if lora_paths is not None and lora_paths[i] is not None:
+                from peft import PeftModel
+
                 self.model = PeftModel.from_pretrained(
                     self.base_model,
                     lora_paths[i],
sglang/test/test_utils.py CHANGED
@@ -25,11 +25,14 @@ from sglang.utils import get_exception_traceback
 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
+DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
 
 
 def is_in_ci():
@@ -585,3 +588,37 @@ def run_bench_latency(model, other_args):
         kill_child_process(process.pid)
 
     return output_throughput
+
+
+def lcs(X, Y):
+    m = len(X)
+    n = len(Y)
+    L = [[0] * (n + 1) for _ in range(m + 1)]
+
+    for i in range(m + 1):
+        for j in range(n + 1):
+            if i == 0 or j == 0:
+                L[i][j] = 0
+            elif X[i - 1] == Y[j - 1]:
+                L[i][j] = L[i - 1][j - 1] + 1
+            else:
+                L[i][j] = max(L[i - 1][j], L[i][j - 1])
+
+    return L[m][n]
+
+
+def calculate_rouge_l(output_strs_list1, output_strs_list2):
+    """calculate the ROUGE-L score"""
+    rouge_l_scores = []
+
+    for s1, s2 in zip(output_strs_list1, output_strs_list2):
+        lcs_len = lcs(s1, s2)
+        precision = lcs_len / len(s1) if len(s1) > 0 else 0
+        recall = lcs_len / len(s2) if len(s2) > 0 else 0
+        if precision + recall > 0:
+            fmeasure = (2 * precision * recall) / (precision + recall)
+        else:
+            fmeasure = 0.0
+        rouge_l_scores.append(fmeasure)
+
+    return rouge_l_scores
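
Note that lcs runs over the characters of each string, so calculate_rouge_l is a character-level ROUGE-L: precision and recall are the LCS length divided by each string's length, combined into an F-measure in [0, 1]. A quick worked example:

```
from sglang.test.test_utils import calculate_rouge_l

# LCS("hello world", "hello there world") is "hello world" (11 chars),
# so precision = 11/11, recall = 11/17, F-measure ≈ 0.786.
print(calculate_rouge_l(["hello world"], ["hello there world"]))
```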
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.3.1.post2"
+__version__ = "0.3.2"
{sglang-0.3.1.post2.dist-info → sglang-0.3.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.1.post2
+Version: 0.3.2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.2 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -348,9 +348,9 @@ docker run --gpus all \
 <summary>More</summary>
 
 > This method is recommended if you plan to serve it as a service.
-> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+> A better approach is to use the [k8s-sglang-service.yaml](docker/k8s-sglang-service.yaml).
 
-1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+1. Copy the [compose.yml](docker/compose.yaml) to your local machine
 2. Execute the command `docker compose up -d` in your terminal.
 </details>
 
@@ -499,6 +499,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
+- OLMoE
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
@@ -520,6 +521,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - BaiChuan2
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
+- SmolLM
 
 
 **Embedding Models**
@@ -528,7 +530,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - gte-Qwen2
   - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
 
-Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
+Instructions for supporting a new model are [here](docs/en/model_support.md).
 
 #### Use Models From ModelScope
 <details>
@@ -823,7 +825,7 @@ def chat_example(s):
 Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 
 ## Roadmap
-[Development Roadmap (2024 Q3)](https://github.com/sgl-project/sglang/issues/634)
+[Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Citation And Acknowledgment
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.