sglang 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. sglang/__init__.py +55 -2
  2. sglang/api.py +3 -5
  3. sglang/backend/anthropic.py +33 -13
  4. sglang/backend/openai.py +2 -1
  5. sglang/backend/runtime_endpoint.py +18 -5
  6. sglang/backend/vertexai.py +1 -0
  7. sglang/global_config.py +1 -0
  8. sglang/lang/chat_template.py +74 -0
  9. sglang/lang/interpreter.py +40 -16
  10. sglang/lang/ir.py +1 -1
  11. sglang/lang/tracer.py +6 -4
  12. sglang/launch_server.py +2 -1
  13. sglang/srt/constrained/fsm_cache.py +15 -3
  14. sglang/srt/constrained/jump_forward.py +1 -0
  15. sglang/srt/conversation.py +2 -2
  16. sglang/srt/hf_transformers_utils.py +2 -1
  17. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  18. sglang/srt/layers/extend_attention.py +1 -0
  19. sglang/srt/layers/logits_processor.py +114 -54
  20. sglang/srt/layers/radix_attention.py +2 -1
  21. sglang/srt/layers/token_attention.py +1 -0
  22. sglang/srt/managers/detokenizer_manager.py +5 -1
  23. sglang/srt/managers/io_struct.py +12 -0
  24. sglang/srt/managers/router/infer_batch.py +70 -33
  25. sglang/srt/managers/router/manager.py +7 -2
  26. sglang/srt/managers/router/model_rpc.py +116 -73
  27. sglang/srt/managers/router/model_runner.py +121 -155
  28. sglang/srt/managers/router/radix_cache.py +46 -38
  29. sglang/srt/managers/tokenizer_manager.py +56 -11
  30. sglang/srt/memory_pool.py +5 -14
  31. sglang/srt/model_config.py +7 -0
  32. sglang/srt/models/commandr.py +376 -0
  33. sglang/srt/models/dbrx.py +413 -0
  34. sglang/srt/models/dbrx_config.py +281 -0
  35. sglang/srt/models/gemma.py +22 -20
  36. sglang/srt/models/llama2.py +23 -21
  37. sglang/srt/models/llava.py +12 -10
  38. sglang/srt/models/mixtral.py +27 -25
  39. sglang/srt/models/qwen.py +23 -21
  40. sglang/srt/models/qwen2.py +23 -21
  41. sglang/srt/models/stablelm.py +292 -0
  42. sglang/srt/models/yivl.py +6 -5
  43. sglang/srt/openai_api_adapter.py +356 -0
  44. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +36 -20
  45. sglang/srt/sampling_params.py +2 -0
  46. sglang/srt/server.py +68 -439
  47. sglang/srt/server_args.py +76 -49
  48. sglang/srt/utils.py +88 -32
  49. sglang/srt/weight_utils.py +402 -0
  50. sglang/test/test_programs.py +8 -7
  51. sglang/test/test_utils.py +196 -8
  52. {sglang-0.1.13.dist-info → sglang-0.1.15.dist-info}/METADATA +13 -15
  53. sglang-0.1.15.dist-info/RECORD +69 -0
  54. {sglang-0.1.13.dist-info → sglang-0.1.15.dist-info}/WHEEL +1 -1
  55. sglang-0.1.13.dist-info/RECORD +0 -63
  56. {sglang-0.1.13.dist-info → sglang-0.1.15.dist-info}/LICENSE +0 -0
  57. {sglang-0.1.13.dist-info → sglang-0.1.15.dist-info}/top_level.txt +0 -0
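Entry 44 above moves `openai_protocol.py` out of `sglang/srt/managers/` and up to `sglang/srt/`. A minimal, illustrative compatibility import for downstream code that must run against both layouts; only the two module paths come from the diff, the try/except arrangement is an assumption:

```python
# Hypothetical shim: only the module paths are taken from entry 44 above.
try:
    from sglang.srt import openai_protocol  # sglang >= 0.1.15 layout
except ImportError:
    from sglang.srt.managers import openai_protocol  # sglang <= 0.1.13 layout
```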
sglang/srt/models/qwen2.py CHANGED
@@ -1,34 +1,36 @@
  # Adapted from llama2.py
  # Modify details for the adaptation of Qwen2 model.
  """Inference-only Qwen2 model compatible with HuggingFace weights."""
- from typing import Any, Dict, List, Optional, Tuple
+ from typing import Any, Dict, Optional, Tuple

  import torch
- from sglang.srt.layers.logits_processor import LogitsProcessor
- from sglang.srt.layers.radix_attention import RadixAttention
- from sglang.srt.managers.router.model_runner import InputMetadata
  from torch import nn
  from vllm.model_executor.layers.activation import SiluAndMul
  from vllm.model_executor.layers.layernorm import RMSNorm
  from vllm.model_executor.layers.linear import (
-     LinearMethodBase,
      MergedColumnParallelLinear,
      QKVParallelLinear,
      RowParallelLinear,
  )
+ from vllm.model_executor.layers.quantization.base_config import (
+     QuantizationConfig)
  from vllm.model_executor.layers.rotary_embedding import get_rope
  from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
      VocabParallelEmbedding,
  )
- from vllm.model_executor.parallel_utils.parallel_state import (
+ from vllm.distributed import (
      get_tensor_model_parallel_world_size,
  )
- from vllm.model_executor.weight_utils import (
+ from sglang.srt.weight_utils import (
      default_weight_loader,
      hf_model_weights_iterator,
  )

+ from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.radix_attention import RadixAttention
+ from sglang.srt.managers.router.model_runner import InputMetadata
+
  Qwen2Config = None


@@ -38,17 +40,17 @@ class Qwen2MLP(nn.Module):
          hidden_size: int,
          intermediate_size: int,
          hidden_act: str,
-         linear_method: Optional[LinearMethodBase] = None,
+         quant_config: Optional[QuantizationConfig] = None,
      ) -> None:
          super().__init__()
          self.gate_up_proj = MergedColumnParallelLinear(
              hidden_size,
              [intermediate_size] * 2,
              bias=False,
-             linear_method=linear_method,
+             quant_config=quant_config,
          )
          self.down_proj = RowParallelLinear(
-             intermediate_size, hidden_size, bias=False, linear_method=linear_method
+             intermediate_size, hidden_size, bias=False, quant_config=quant_config,
          )
          if hidden_act != "silu":
              raise ValueError(
@@ -74,7 +76,7 @@ class Qwen2Attention(nn.Module):
          rope_theta: float = 1000000,
          rope_scaling: Optional[Dict[str, Any]] = None,
          max_position_embeddings: int = 32768,
-         linear_method: Optional[LinearMethodBase] = None,
+         quant_config: Optional[QuantizationConfig] = None,
      ) -> None:
          super().__init__()
          self.hidden_size = hidden_size
@@ -105,13 +107,13 @@ class Qwen2Attention(nn.Module):
              self.total_num_heads,
              self.total_num_kv_heads,
              bias=True,
-             linear_method=linear_method,
+             quant_config=quant_config,
          )
          self.o_proj = RowParallelLinear(
              self.total_num_heads * self.head_dim,
              hidden_size,
              bias=False,
-             linear_method=linear_method,
+             quant_config=quant_config,
          )

          self.rotary_emb = get_rope(
@@ -148,7 +150,7 @@ class Qwen2DecoderLayer(nn.Module):
          self,
          config: Qwen2Config,
          layer_id: int = 0,
-         linear_method: Optional[LinearMethodBase] = None,
+         quant_config: Optional[QuantizationConfig] = None,
      ) -> None:
          super().__init__()
          self.hidden_size = config.hidden_size
@@ -163,13 +165,13 @@ class Qwen2DecoderLayer(nn.Module):
              rope_theta=rope_theta,
              rope_scaling=rope_scaling,
              max_position_embeddings=max_position_embeddings,
-             linear_method=linear_method,
+             quant_config=quant_config,
          )
          self.mlp = Qwen2MLP(
              hidden_size=self.hidden_size,
              intermediate_size=config.intermediate_size,
              hidden_act=config.hidden_act,
-             linear_method=linear_method,
+             quant_config=quant_config,
          )
          self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
          self.post_attention_layernorm = RMSNorm(
@@ -205,7 +207,7 @@ class Qwen2Model(nn.Module):
      def __init__(
          self,
          config: Qwen2Config,
-         linear_method: Optional[LinearMethodBase] = None,
+         quant_config: Optional[QuantizationConfig] = None,
      ) -> None:
          super().__init__()
          self.config = config
@@ -217,7 +219,7 @@ class Qwen2Model(nn.Module):
          )
          self.layers = nn.ModuleList(
              [
-                 Qwen2DecoderLayer(config, i, linear_method)
+                 Qwen2DecoderLayer(config, i, quant_config=quant_config)
                  for i in range(config.num_hidden_layers)
              ]
          )
@@ -251,12 +253,12 @@ class Qwen2ForCausalLM(nn.Module):
      def __init__(
          self,
          config: Qwen2Config,
-         linear_method: Optional[LinearMethodBase] = None,
+         quant_config: Optional[QuantizationConfig] = None,
      ) -> None:
          super().__init__()
          self.config = config
-         self.linear_method = linear_method
-         self.model = Qwen2Model(config, linear_method)
+         self.quant_config = quant_config
+         self.model = Qwen2Model(config, quant_config=quant_config)
          self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
          self.logits_processor = LogitsProcessor(config)

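The qwen2.py hunks above track two related migrations: the weight-loading helpers now come from the vendored `sglang.srt.weight_utils` module instead of `vllm.model_executor.weight_utils`, tensor-parallel state moves from `vllm.model_executor.parallel_utils.parallel_state` to `vllm.distributed`, and layer constructors accept `quant_config` in place of `linear_method`. A hedged sketch of how external code could import the moved helpers across both layouts; the module paths are taken from the hunks, while the try/except fallback itself is illustrative:

```python
# Illustrative compatibility imports; only the module paths come from the
# hunks above, the fallback arrangement is an assumption.
try:
    # sglang 0.1.15: helpers vendored into the package
    from sglang.srt.weight_utils import (
        default_weight_loader,
        hf_model_weights_iterator,
    )
except ImportError:
    # pre-0.1.15 layout: the same helpers lived in vllm
    from vllm.model_executor.weight_utils import (
        default_weight_loader,
        hf_model_weights_iterator,
    )

try:
    from vllm.distributed import get_tensor_model_parallel_world_size
except ImportError:
    from vllm.model_executor.parallel_utils.parallel_state import (
        get_tensor_model_parallel_world_size,
    )
```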
sglang/srt/models/stablelm.py ADDED
@@ -0,0 +1,292 @@
+ # This code is based on:
+ # https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/stablelm.py
+ """Inference-only StableLM-2 (https://huggingface.co/stabilityai/stablelm-2-1_6b)
+ model compatible with HuggingFace weights."""
+ from typing import Optional, Tuple
+
+ import torch
+ from torch import nn
+ from transformers import PretrainedConfig
+ from vllm.model_executor.layers.activation import SiluAndMul
+ from vllm.model_executor.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
+ from vllm.model_executor.layers.quantization.base_config import (
+     QuantizationConfig)
+ from vllm.model_executor.layers.rotary_embedding import get_rope
+ from vllm.model_executor.layers.vocab_parallel_embedding import (
+     ParallelLMHead,
+     VocabParallelEmbedding,
+ )
+ from vllm.distributed import (
+     get_tensor_model_parallel_world_size,
+ )
+ from sglang.srt.weight_utils import (
+     default_weight_loader,
+     hf_model_weights_iterator,
+ )
+
+ from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.radix_attention import RadixAttention
+ from sglang.srt.managers.router.model_runner import InputMetadata
+
+
+ class StablelmMLP(nn.Module):
+     def __init__(
+         self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None,
+     ) -> None:
+         super().__init__()
+         self.config = config
+         self.hidden_size = config.hidden_size
+         self.intermediate_size = config.intermediate_size
+         self.gate_up_proj = MergedColumnParallelLinear(
+             config.hidden_size,
+             [config.intermediate_size] * 2,
+             bias=False,
+             quant_config=quant_config,
+         )
+         self.down_proj = RowParallelLinear(
+             config.intermediate_size, config.hidden_size, bias=False, quant_config=quant_config,
+         )
+         self.act_fn = SiluAndMul()
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         gate_up, _ = self.gate_up_proj(x)
+         x = self.act_fn(gate_up)
+         x, _ = self.down_proj(x)
+         return x
+
+
+ class StablelmAttention(nn.Module):
+     def __init__(
+         self,
+         config: PretrainedConfig,
+         layer_id: int = 0,
+         quant_config: Optional[QuantizationConfig] = None,
+     ) -> None:
+         super().__init__()
+         self.config = config
+         self.hidden_size = config.hidden_size
+         tp_size = get_tensor_model_parallel_world_size()
+         self.total_num_heads = config.num_attention_heads
+         self.num_heads = self.total_num_heads // tp_size
+
+         self.total_num_key_value_heads = config.num_key_value_heads
+         if self.total_num_key_value_heads >= tp_size:
+             # Number of KV heads is greater than TP size, so we partition
+             # the KV heads across multiple tensor parallel GPUs.
+             assert self.total_num_key_value_heads % tp_size == 0
+         else:
+             # Number of KV heads is less than TP size, so we replicate
+             # the KV heads across multiple tensor parallel GPUs.
+             assert tp_size % self.total_num_key_value_heads == 0
+         self.num_key_value_heads = max(1, self.total_num_key_value_heads // tp_size)
+         self.head_dim = self.hidden_size // self.total_num_heads
+         self.max_position_embeddings = config.max_position_embeddings
+         rope_pct = getattr(
+             config, "rope_pct", getattr(config, "partial_rotary_factor", 1)
+         )
+         self.rotary_ndims = int(self.head_dim * rope_pct)
+         self.scaling = self.head_dim**-0.5
+         self.q_size = self.num_heads * self.head_dim
+         self.kv_size = self.num_key_value_heads * self.head_dim
+         self.qkv_bias = getattr(config, "use_qkv_bias", False)
+         if (self.head_dim * self.num_heads * tp_size) != self.hidden_size:
+             raise ValueError(
+                 f"hidden_size must be divisible by num_heads "
+                 f"(got `hidden_size`: {self.hidden_size}"
+                 f" and `num_heads`: {self.num_heads})."
+             )
+
+         self.qkv_proj = QKVParallelLinear(
+             self.hidden_size,
+             self.head_dim,
+             self.total_num_heads,
+             self.total_num_key_value_heads,
+             self.qkv_bias,
+         )
+         self.o_proj = RowParallelLinear(
+             self.total_num_heads * self.head_dim,
+             self.hidden_size,
+             bias=False,
+         )
+         self.rotary_emb = get_rope(
+             self.head_dim,
+             rotary_dim=self.rotary_ndims,
+             max_position=self.config.max_position_embeddings,
+             base=self.config.rope_theta,
+         )
+         self.attn = RadixAttention(
+             self.num_heads,
+             self.head_dim,
+             self.scaling,
+             num_kv_heads=self.num_key_value_heads,
+             layer_id=layer_id,
+         )
+
+     def forward(
+         self,
+         positions: torch.Tensor,
+         hidden_states: torch.Tensor,
+         input_metadata: InputMetadata,
+     ) -> torch.Tensor:
+         qkv, _ = self.qkv_proj(hidden_states)
+         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+         q, k = self.rotary_emb(positions, q, k)
+         attn_output = self.attn(q, k, v, input_metadata)
+         output, _ = self.o_proj(attn_output)
+         return output
+
+
+ class StablelmDecoderLayer(nn.Module):
+     def __init__(
+         self,
+         config: PretrainedConfig,
+         layer_id: int = 0,
+         quant_config: Optional[QuantizationConfig] = None,
+     ) -> None:
+         super().__init__()
+         self.self_attn = StablelmAttention(config, layer_id=layer_id)
+         self.mlp = StablelmMLP(config, quant_config=quant_config)
+         norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05))
+         self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+         self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+
+     def forward(
+         self,
+         positions: torch.Tensor,
+         hidden_states: torch.Tensor,
+         input_metadata: InputMetadata,
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         # Self Attention
+         residual = hidden_states
+         hidden_states = self.input_layernorm(hidden_states)
+         hidden_states = self.self_attn(
+             positions=positions,
+             hidden_states=hidden_states,
+             input_metadata=input_metadata,
+         )
+         hidden_states = residual + hidden_states
+
+         # Fully Connected
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         hidden_states = self.mlp(hidden_states)
+         hidden_states = residual + hidden_states
+
+         return hidden_states, residual
+
+
+ class StableLMEpochModel(nn.Module):
+     def __init__(
+         self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None,
+     ) -> None:
+         super().__init__()
+         self.embed_tokens = VocabParallelEmbedding(
+             config.vocab_size,
+             config.hidden_size,
+         )
+         self.layers = nn.ModuleList(
+             [
+                 StablelmDecoderLayer(config, i, quant_config=quant_config)
+                 for i in range(config.num_hidden_layers)
+             ]
+         )
+         norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05))
+         self.norm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         positions: torch.Tensor,
+         input_metadata: InputMetadata,
+         input_embeds: torch.Tensor = None,
+     ) -> torch.Tensor:
+         if input_embeds is None:
+             hidden_states = self.embed_tokens(input_ids)
+         else:
+             hidden_states = input_embeds
+         for i in range(len(self.layers)):
+             layer = self.layers[i]
+             hidden_states, residual = layer(
+                 positions,
+                 hidden_states,
+                 input_metadata,
+             )
+         hidden_states = self.norm(hidden_states)
+         return hidden_states
+
+
+ class StableLmForCausalLM(nn.Module):
+     def __init__(
+         self,
+         config: PretrainedConfig,
+         quant_config: Optional[QuantizationConfig] = None,
+     ) -> None:
+         super().__init__()
+         self.config = config
+         self.quant_config = quant_config
+         self.model = StableLMEpochModel(config, quant_config=quant_config)
+         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+         self.logits_processor = LogitsProcessor(config)
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         positions: torch.Tensor,
+         input_metadata: InputMetadata,
+         input_embeds: torch.Tensor = None,
+     ) -> torch.Tensor:
+         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
+         return self.logits_processor(
+             input_ids, hidden_states, self.lm_head.weight, input_metadata
+         )
+
+     def load_weights(
+         self,
+         model_name_or_path: str,
+         cache_dir: Optional[str] = None,
+         load_format: str = "auto",
+         revision: Optional[str] = None,
+     ):
+         stacked_params_mapping = [
+             # (param_name, shard_name, shard_id)
+             ("qkv_proj", "q_proj", "q"),
+             ("qkv_proj", "k_proj", "k"),
+             ("qkv_proj", "v_proj", "v"),
+             ("gate_up_proj", "gate_proj", 0),
+             ("gate_up_proj", "up_proj", 1),
+         ]
+         params_dict = dict(self.named_parameters())
+         for name, loaded_weight in hf_model_weights_iterator(
+             model_name_or_path, cache_dir, load_format, revision
+         ):
+             if "rotary_emb.inv_freq" in name:
+                 continue
+             if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                 # Models trained using ColossalAI may include these tensors in
+                 # the checkpoint. Skip them.
+                 continue
+             for param_name, weight_name, shard_id in stacked_params_mapping:
+                 if weight_name not in name:
+                     continue
+                 name = name.replace(weight_name, param_name)
+                 # Skip loading extra bias for GPTQ models.
+                 if name.endswith(".bias") and name not in params_dict:
+                     continue
+                 param = params_dict[name]
+                 weight_loader = param.weight_loader
+                 weight_loader(param, loaded_weight, shard_id)
+                 break
+             else:
+                 # Skip loading extra bias for GPTQ models.
+                 if name.endswith(".bias") and name not in params_dict:
+                     continue
+                 param = params_dict[name]
+                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                 weight_loader(param, loaded_weight)
+
+
+ EntryClass = StableLmForCausalLM
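The `load_weights` method in the new stablelm.py above folds per-projection checkpoint tensors into the fused `qkv_proj` and `gate_up_proj` parameters. A standalone sketch of just the name-mapping step, reusing the same `stacked_params_mapping` table from the diff; the helper function and example name are illustrative, not part of the package:

```python
# Illustrative only: reproduces the name-rewriting step of load_weights()
# above, without the actual tensor loading.
STACKED_PARAMS_MAPPING = [
    # (fused param name, checkpoint shard name, shard id)
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
    ("gate_up_proj", "gate_proj", 0),
    ("gate_up_proj", "up_proj", 1),
]

def map_checkpoint_name(name: str):
    """Map a checkpoint tensor name to (fused_param_name, shard_id)."""
    for fused, shard_name, shard_id in STACKED_PARAMS_MAPPING:
        if shard_name in name:
            return name.replace(shard_name, fused), shard_id
    return name, None  # unfused tensors load as-is via default_weight_loader

# prints ('model.layers.0.self_attn.qkv_proj.weight', 'q')
print(map_checkpoint_name("model.layers.0.self_attn.q_proj.weight"))
```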
sglang/srt/models/yivl.py CHANGED
@@ -5,16 +5,17 @@ from typing import List, Optional

  import torch
  import torch.nn as nn
+ from transformers import CLIPVisionModel, LlavaConfig
+ from sglang.srt.weight_utils import (
+     default_weight_loader,
+     hf_model_weights_iterator,
+ )
+
  from sglang.srt.models.llava import (
      LlavaLlamaForCausalLM,
      clip_vision_embed_forward,
      monkey_path_clip_vision_embed_forward,
  )
- from transformers import CLIPVisionModel, LlavaConfig
- from vllm.model_executor.weight_utils import (
-     default_weight_loader,
-     hf_model_weights_iterator,
- )


  class YiVLForCausalLM(LlavaLlamaForCausalLM):