sglang 0.4.1.post6__py3-none-any.whl → 0.4.1.post7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. sglang/__init__.py +21 -23
  2. sglang/api.py +2 -7
  3. sglang/bench_offline_throughput.py +24 -16
  4. sglang/bench_one_batch.py +51 -3
  5. sglang/bench_one_batch_server.py +1 -1
  6. sglang/bench_serving.py +37 -28
  7. sglang/lang/backend/runtime_endpoint.py +183 -4
  8. sglang/lang/chat_template.py +15 -4
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/_custom_ops.py +80 -42
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/model_config.py +1 -0
  13. sglang/srt/constrained/base_grammar_backend.py +21 -0
  14. sglang/srt/constrained/xgrammar_backend.py +8 -4
  15. sglang/srt/conversation.py +14 -1
  16. sglang/srt/distributed/__init__.py +3 -3
  17. sglang/srt/distributed/communication_op.py +2 -1
  18. sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +107 -40
  20. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  21. sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
  22. sglang/srt/distributed/device_communicators/pynccl.py +80 -1
  23. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
  24. sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
  25. sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
  26. sglang/srt/distributed/parallel_state.py +1 -1
  27. sglang/srt/distributed/utils.py +2 -1
  28. sglang/srt/entrypoints/engine.py +449 -0
  29. sglang/srt/entrypoints/http_server.py +579 -0
  30. sglang/srt/layers/activation.py +3 -3
  31. sglang/srt/layers/attention/flashinfer_backend.py +10 -9
  32. sglang/srt/layers/attention/triton_backend.py +4 -6
  33. sglang/srt/layers/attention/vision.py +204 -0
  34. sglang/srt/layers/dp_attention.py +69 -0
  35. sglang/srt/layers/linear.py +41 -5
  36. sglang/srt/layers/logits_processor.py +48 -63
  37. sglang/srt/layers/moe/ep_moe/layer.py +4 -4
  38. sglang/srt/layers/moe/fused_moe_native.py +69 -0
  39. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -6
  40. sglang/srt/layers/moe/fused_moe_triton/layer.py +29 -5
  41. sglang/srt/layers/parameter.py +2 -1
  42. sglang/srt/layers/quantization/__init__.py +20 -23
  43. sglang/srt/layers/quantization/fp8.py +6 -3
  44. sglang/srt/layers/quantization/modelopt_quant.py +1 -2
  45. sglang/srt/layers/quantization/w8a8_int8.py +1 -1
  46. sglang/srt/layers/radix_attention.py +2 -2
  47. sglang/srt/layers/rotary_embedding.py +1179 -31
  48. sglang/srt/layers/sampler.py +39 -1
  49. sglang/srt/layers/vocab_parallel_embedding.py +2 -2
  50. sglang/srt/lora/lora.py +1 -9
  51. sglang/srt/managers/configure_logging.py +3 -0
  52. sglang/srt/managers/data_parallel_controller.py +79 -72
  53. sglang/srt/managers/detokenizer_manager.py +23 -6
  54. sglang/srt/managers/image_processor.py +158 -2
  55. sglang/srt/managers/io_struct.py +25 -2
  56. sglang/srt/managers/schedule_batch.py +49 -22
  57. sglang/srt/managers/schedule_policy.py +26 -12
  58. sglang/srt/managers/scheduler.py +277 -178
  59. sglang/srt/managers/session_controller.py +1 -0
  60. sglang/srt/managers/tokenizer_manager.py +206 -121
  61. sglang/srt/managers/tp_worker.py +6 -4
  62. sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
  63. sglang/srt/managers/utils.py +44 -0
  64. sglang/srt/mem_cache/memory_pool.py +10 -32
  65. sglang/srt/metrics/collector.py +15 -6
  66. sglang/srt/model_executor/cuda_graph_runner.py +4 -6
  67. sglang/srt/model_executor/model_runner.py +37 -15
  68. sglang/srt/model_loader/loader.py +8 -6
  69. sglang/srt/model_loader/weight_utils.py +55 -2
  70. sglang/srt/models/baichuan.py +6 -6
  71. sglang/srt/models/chatglm.py +2 -2
  72. sglang/srt/models/commandr.py +3 -3
  73. sglang/srt/models/dbrx.py +4 -4
  74. sglang/srt/models/deepseek.py +3 -3
  75. sglang/srt/models/deepseek_v2.py +8 -8
  76. sglang/srt/models/exaone.py +2 -2
  77. sglang/srt/models/gemma.py +2 -2
  78. sglang/srt/models/gemma2.py +6 -24
  79. sglang/srt/models/gpt2.py +3 -5
  80. sglang/srt/models/gpt_bigcode.py +1 -1
  81. sglang/srt/models/granite.py +2 -2
  82. sglang/srt/models/grok.py +3 -3
  83. sglang/srt/models/internlm2.py +2 -2
  84. sglang/srt/models/llama.py +7 -5
  85. sglang/srt/models/minicpm.py +2 -2
  86. sglang/srt/models/minicpm3.py +6 -6
  87. sglang/srt/models/minicpmv.py +1238 -0
  88. sglang/srt/models/mixtral.py +3 -3
  89. sglang/srt/models/mixtral_quant.py +3 -3
  90. sglang/srt/models/mllama.py +2 -2
  91. sglang/srt/models/olmo.py +3 -3
  92. sglang/srt/models/olmo2.py +4 -4
  93. sglang/srt/models/olmoe.py +7 -13
  94. sglang/srt/models/phi3_small.py +2 -2
  95. sglang/srt/models/qwen.py +2 -2
  96. sglang/srt/models/qwen2.py +41 -4
  97. sglang/srt/models/qwen2_moe.py +3 -3
  98. sglang/srt/models/qwen2_vl.py +22 -122
  99. sglang/srt/models/stablelm.py +2 -2
  100. sglang/srt/models/torch_native_llama.py +3 -3
  101. sglang/srt/models/xverse.py +6 -6
  102. sglang/srt/models/xverse_moe.py +6 -6
  103. sglang/srt/openai_api/protocol.py +2 -0
  104. sglang/srt/sampling/custom_logit_processor.py +38 -0
  105. sglang/srt/sampling/sampling_batch_info.py +139 -4
  106. sglang/srt/sampling/sampling_params.py +3 -1
  107. sglang/srt/server.py +4 -1090
  108. sglang/srt/server_args.py +57 -14
  109. sglang/srt/utils.py +103 -65
  110. sglang/test/runners.py +8 -13
  111. sglang/test/test_programs.py +1 -1
  112. sglang/test/test_utils.py +3 -1
  113. sglang/utils.py +12 -2
  114. sglang/version.py +1 -1
  115. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/METADATA +16 -5
  116. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/RECORD +119 -115
  117. sglang/launch_server_llavavid.py +0 -25
  118. sglang/srt/constrained/__init__.py +0 -16
  119. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  120. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/LICENSE +0 -0
  121. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/WHEEL +0 -0
  122. {sglang-0.4.1.post6.dist-info → sglang-0.4.1.post7.dist-info}/top_level.txt +0 -0
sglang/srt/models/exaone.py CHANGED
@@ -20,9 +20,8 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
 from torch import nn
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -33,6 +32,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
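
Most model-file changes in this release follow the pattern in the hunk above: imports that previously came from vLLM (vllm.distributed, vllm.model_executor) now resolve from sglang's own in-tree packages under sglang.srt. A minimal before/after sketch of the pattern (illustrative only; exact call sites vary per model):

    # Before (0.4.1.post6): tensor-parallel helpers and RoPE came from vLLM.
    # from vllm.distributed import get_tensor_model_parallel_world_size
    # from vllm.model_executor.layers.rotary_embedding import get_rope

    # After (0.4.1.post7): the same symbols come from sglang's own modules.
    from sglang.srt.distributed import get_tensor_model_parallel_world_size
    from sglang.srt.layers.rotary_embedding import get_rope
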
sglang/srt/models/gemma.py CHANGED
@@ -21,9 +21,8 @@ from typing import Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import GeluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -34,6 +33,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
sglang/srt/models/gemma2.py CHANGED
@@ -15,13 +15,13 @@
 # Adapted from:
 # https://github.com/vllm-project/vllm/blob/56b325e977435af744f8b3dca7af0ca209663558/vllm/model_executor/models/gemma2.py
 
-from typing import Iterable, Optional, Set, Tuple, Union
+from typing import Iterable, Optional, Set, Tuple
 
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import GeluAndMul
 from sglang.srt.layers.layernorm import GemmaRMSNorm
 from sglang.srt.layers.linear import (
@@ -32,6 +32,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
@@ -44,23 +45,6 @@ def get_attention_sliding_window_size(config):
     return config.sliding_window - 1
 
 
-# FIXME: temporary solution, remove after next vllm release
-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
-
-
-class GemmaRotaryEmbedding(RotaryEmbedding):
-    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
-        # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/gemma/modeling_gemma.py#L107
-        inv_freq = 1.0 / (
-            base
-            ** (
-                torch.arange(0, self.rotary_dim, 2, dtype=torch.int64).float()
-                / self.rotary_dim
-            )
-        )
-        return inv_freq
-
-
 class Gemma2MLP(nn.Module):
     def __init__(
         self,
@@ -143,14 +127,12 @@ class Gemma2Attention(nn.Module):
             bias=config.attention_bias,
             quant_config=quant_config,
         )
-        # from vLLM: TODO(woosuk): Use the `get_rope` interface.
-        self.rotary_emb = GemmaRotaryEmbedding(
-            self.head_dim,
+        self.rotary_emb = get_rope(
             self.head_dim,
-            max_position_embeddings,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
             base=self.rope_theta,
             is_neox_style=True,
-            dtype=torch.get_default_dtype(),
         )
 
         use_sliding_window = layer_id % 2 == 0 and hasattr(config, "sliding_window")
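
The deleted GemmaRotaryEmbedding subclass existed only to override _compute_inv_freq, deriving RoPE inverse frequencies from an int64 arange cast to float to match HF's Gemma implementation; with sglang's get_rope (imported above) covering that path, the bespoke class is redundant. A standalone sketch of the frequency computation the removed override performed (plain torch, for illustration):

    import torch

    def rope_inv_freq(rotary_dim: int, base: float = 10000.0) -> torch.Tensor:
        # Same formula as the removed GemmaRotaryEmbedding._compute_inv_freq:
        # int64 arange, cast to float, then 1 / base^(2i / rotary_dim).
        exponent = torch.arange(0, rotary_dim, 2, dtype=torch.int64).float() / rotary_dim
        return 1.0 / (base**exponent)

    print(rope_inv_freq(8))  # tensor([1.0000, 0.1000, 0.0100, 0.0010])
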
sglang/srt/models/gpt2.py CHANGED
@@ -17,16 +17,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only GPT-2 model compatible with HuggingFace weights."""
-from typing import Iterable, List, Optional, Tuple
+from typing import Iterable, Optional, Tuple
 
 import torch
 from torch import nn
 from transformers import GPT2Config
-from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 
-# from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
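
Note that gpt2.py also stops importing get_act_fn from vLLM; the previously commented-out sglang import is now live. Assuming the in-tree helper mirrors the vLLM function it replaces (lookup by activation name — an assumption, not confirmed by this diff), usage would look like:

    from sglang.srt.layers.activation import get_act_fn

    # GPT-2 configs use the "gelu_new" activation; the helper is assumed to
    # return an nn.Module implementing it, as the vLLM version did.
    act = get_act_fn("gelu_new")
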
sglang/srt/models/gpt_bigcode.py CHANGED
@@ -21,8 +21,8 @@ from typing import Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import GPTBigCodeConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import get_act_fn
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
sglang/srt/models/granite.py CHANGED
@@ -22,9 +22,8 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import GraniteConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -36,6 +35,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorO
 from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
sglang/srt/models/grok.py CHANGED
@@ -22,12 +22,11 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import (
+
+from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
-
 from sglang.srt.layers.activation import GeluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -40,6 +39,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
sglang/srt/models/internlm2.py CHANGED
@@ -19,9 +19,8 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -32,6 +31,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
sglang/srt/models/llama.py CHANGED
@@ -22,13 +22,11 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import LlamaConfig
-from vllm.distributed import (
+
+from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.model_loader.weight_utils import kv_cache_scales_loader
-
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -40,12 +38,16 @@ from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorO
 from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+)
 from sglang.srt.utils import make_layers
 from sglang.utils import get_exception_traceback
 
sglang/srt/models/minicpm.py CHANGED
@@ -18,9 +18,8 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
 from torch import nn
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.rotary_embedding import get_rope
 
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
@@ -31,6 +30,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
sglang/srt/models/minicpm3.py CHANGED
@@ -19,20 +19,20 @@ from typing import Any, Dict, Iterable, Optional, Tuple
 import torch
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.linear import (
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
     ColumnParallelLinear,
     MergedColumnParallelLinear,
     ReplicatedLinear,
     RowParallelLinear,
 )
-from vllm.model_executor.layers.rotary_embedding import get_rope
-
-from sglang.srt.layers.activation import SiluAndMul
-from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,