sglang 0.1.18__py3-none-any.whl → 0.1.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. sglang/__init__.py +1 -1
  2. sglang/api.py +26 -0
  3. sglang/backend/runtime_endpoint.py +18 -14
  4. sglang/bench_latency.py +40 -18
  5. sglang/global_config.py +21 -16
  6. sglang/lang/chat_template.py +41 -6
  7. sglang/lang/interpreter.py +5 -1
  8. sglang/lang/ir.py +61 -25
  9. sglang/srt/constrained/__init__.py +3 -2
  10. sglang/srt/hf_transformers_utils.py +7 -3
  11. sglang/srt/layers/extend_attention.py +2 -1
  12. sglang/srt/layers/fused_moe.py +181 -167
  13. sglang/srt/layers/logits_processor.py +55 -19
  14. sglang/srt/layers/radix_attention.py +33 -59
  15. sglang/srt/layers/token_attention.py +4 -8
  16. sglang/srt/managers/controller/cuda_graph_runner.py +172 -0
  17. sglang/srt/managers/controller/infer_batch.py +244 -36
  18. sglang/srt/managers/controller/manager_single.py +1 -1
  19. sglang/srt/managers/controller/model_runner.py +69 -284
  20. sglang/srt/managers/controller/tp_worker.py +39 -20
  21. sglang/srt/managers/detokenizer_manager.py +4 -2
  22. sglang/srt/managers/io_struct.py +1 -1
  23. sglang/srt/managers/tokenizer_manager.py +14 -13
  24. sglang/srt/memory_pool.py +33 -6
  25. sglang/srt/model_config.py +6 -0
  26. sglang/srt/models/gemma2.py +436 -0
  27. sglang/srt/models/llama2.py +3 -3
  28. sglang/srt/models/llama_classification.py +10 -7
  29. sglang/srt/models/minicpm.py +373 -0
  30. sglang/srt/models/qwen2_moe.py +454 -0
  31. sglang/srt/openai_api_adapter.py +2 -2
  32. sglang/srt/openai_protocol.py +1 -1
  33. sglang/srt/server.py +18 -8
  34. sglang/srt/server_args.py +24 -20
  35. sglang/srt/utils.py +68 -35
  36. {sglang-0.1.18.dist-info → sglang-0.1.20.dist-info}/METADATA +19 -13
  37. {sglang-0.1.18.dist-info → sglang-0.1.20.dist-info}/RECORD +40 -36
  38. {sglang-0.1.18.dist-info → sglang-0.1.20.dist-info}/WHEEL +1 -1
  39. {sglang-0.1.18.dist-info → sglang-0.1.20.dist-info}/LICENSE +0 -0
  40. {sglang-0.1.18.dist-info → sglang-0.1.20.dist-info}/top_level.txt +0 -0
sglang/srt/models/minicpm.py (new file)
@@ -0,0 +1,373 @@
+ """Inference-only MiniCPM model compatible with HuggingFace weights."""
+
+ import math
+ from typing import Any, Dict, Iterable, Optional, Tuple
+
+ import torch
+ from torch import nn
+
+ from vllm.config import CacheConfig
+ from vllm.distributed import get_tensor_model_parallel_world_size
+
+ from vllm.model_executor.layers.activation import SiluAndMul
+
+ from vllm.model_executor.layers.layernorm import RMSNorm
+ from vllm.model_executor.layers.linear import (
+     MergedColumnParallelLinear,
+     QKVParallelLinear,
+     RowParallelLinear,
+ )
+ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
+ from vllm.model_executor.layers.rotary_embedding import get_rope
+ from vllm.model_executor.layers.vocab_parallel_embedding import (
+     ParallelLMHead,
+     VocabParallelEmbedding,
+ )
+ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+ from sglang.srt.layers.logits_processor import LogitsProcessor
+ from sglang.srt.layers.radix_attention import RadixAttention
+ from sglang.srt.managers.controller.model_runner import InputMetadata
+
+
+ class MiniCPMMLP(nn.Module):
+
+     def __init__(
+         self,
+         hidden_size: int,
+         intermediate_size: int,
+         hidden_act: str,
+         quant_config: Optional[QuantizationConfig] = None,
+     ) -> None:
+         super().__init__()
+         self.gate_up_proj = MergedColumnParallelLinear(
+             hidden_size,
+             [intermediate_size] * 2,
+             bias=False,
+             quant_config=quant_config,
+         )
+         self.down_proj = RowParallelLinear(
+             intermediate_size,
+             hidden_size,
+             bias=False,
+             quant_config=quant_config,
+         )
+         if hidden_act != "silu":
+             raise ValueError(
+                 f"Unsupported activation: {hidden_act}. "
+                 "Only silu is supported for now."
+             )
+         self.act_fn = SiluAndMul()
+
+     def forward(self, x):
+         gate_up, _ = self.gate_up_proj(x)
+         x = self.act_fn(gate_up)
+         x, _ = self.down_proj(x)
+         return x
+
+
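Note: the MLP above keeps the gate and up projections fused in one MergedColumnParallelLinear, and vLLM's SiluAndMul splits that fused output in half and returns silu(gate) * up. A minimal single-GPU sketch of the same computation in plain PyTorch (the function name and tensor sizes below are illustrative, not part of the package):

    import torch
    import torch.nn.functional as F

    def gated_silu_mlp(x, w_gate_up, w_down):
        # w_gate_up stacks the gate and up projections: (2 * intermediate, hidden).
        gate_up = x @ w_gate_up.t()
        gate, up = gate_up.chunk(2, dim=-1)  # the split SiluAndMul performs internally
        return (F.silu(gate) * up) @ w_down.t()

    hidden, inter = 8, 16  # illustrative sizes
    x = torch.randn(2, hidden)
    out = gated_silu_mlp(x, torch.randn(2 * inter, hidden), torch.randn(hidden, inter))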
+ class MiniCPMAttention(nn.Module):
+
+     def __init__(
+         self,
+         hidden_size: int,
+         num_heads: int,
+         num_kv_heads: int,
+         layer_id: int = 0,
+         rope_theta: float = 10000,
+         rope_scaling: Optional[Dict[str, Any]] = None,
+         max_position_embeddings: int = 8192,
+         quant_config: Optional[QuantizationConfig] = None,
+     ) -> None:
+         super().__init__()
+         self.hidden_size = hidden_size
+         tp_size = get_tensor_model_parallel_world_size()
+         self.total_num_heads = num_heads
+         assert self.total_num_heads % tp_size == 0
+         self.num_heads = self.total_num_heads // tp_size
+         self.total_num_kv_heads = num_kv_heads
+         if self.total_num_kv_heads >= tp_size:
+             # Number of KV heads is greater than TP size, so we partition
+             # the KV heads across multiple tensor parallel GPUs.
+             assert self.total_num_kv_heads % tp_size == 0
+         else:
+             # Number of KV heads is less than TP size, so we replicate
+             # the KV heads across multiple tensor parallel GPUs.
+             assert tp_size % self.total_num_kv_heads == 0
+         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+         self.head_dim = hidden_size // self.total_num_heads
+         self.q_size = self.num_heads * self.head_dim
+         self.kv_size = self.num_kv_heads * self.head_dim
+         self.scaling = self.head_dim**-0.5
+         self.rope_theta = rope_theta
+         self.max_position_embeddings = max_position_embeddings
+
+         self.qkv_proj = QKVParallelLinear(
+             hidden_size,
+             self.head_dim,
+             self.total_num_heads,
+             self.total_num_kv_heads,
+             bias=False,
+             quant_config=quant_config,
+         )
+         self.o_proj = RowParallelLinear(
+             self.total_num_heads * self.head_dim,
+             hidden_size,
+             bias=False,
+             quant_config=quant_config,
+         )
+
+         self.rotary_emb = get_rope(
+             self.head_dim,
+             rotary_dim=self.head_dim,
+             max_position=max_position_embeddings,
+             base=rope_theta,
+             rope_scaling=rope_scaling,
+         )
+         # set rope as fp32 instead of bf16
+         self.rotary_emb.cos_sin_cache = self.rotary_emb._compute_cos_sin_cache()
+         self.attn = RadixAttention(
+             self.num_heads,
+             self.head_dim,
+             self.scaling,
+             num_kv_heads=self.num_kv_heads,
+             layer_id=layer_id,
+         )
+
+     def forward(
+         self,
+         positions: torch.Tensor,
+         hidden_states: torch.Tensor,
+         input_metadata: InputMetadata,
+     ) -> torch.Tensor:
+         qkv, _ = self.qkv_proj(hidden_states)
+         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+         orig_dtype = q.dtype
+         q, k = q.float(), k.float()
+         q, k = self.rotary_emb(positions, q, k)
+         q, k = q.to(orig_dtype), k.to(orig_dtype)
+         attn_output = self.attn(q, k, v, input_metadata)
+         output, _ = self.o_proj(attn_output)
+         return output
+
+
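Note: a MiniCPM-specific detail in the attention block above is the rotary-embedding precision: the cos/sin cache is recomputed in fp32 ("set rope as fp32 instead of bf16"), and q/k are upcast to float before the rotation, then cast back so RadixAttention sees the original bf16/fp16 dtype. A minimal sketch of that dtype round-trip; apply_rope here is a hypothetical stand-in for the vLLM rotary kernel, not a package function:

    import torch

    def rope_in_fp32(q: torch.Tensor, k: torch.Tensor, apply_rope):
        # Rotate in fp32 for numerical accuracy, then restore the original dtype
        # so the attention kernel sees the precision it expects.
        orig_dtype = q.dtype
        q32, k32 = apply_rope(q.float(), k.float())
        return q32.to(orig_dtype), k32.to(orig_dtype)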
+ class MiniCPMDecoderLayer(nn.Module):
+
+     def __init__(
+         self,
+         config,
+         layer_id: int = 0,
+         quant_config: Optional[QuantizationConfig] = None,
+     ) -> None:
+         super().__init__()
+         self.config = config
+         self.hidden_size = config.hidden_size
+         rope_theta = getattr(config, "rope_theta", 10000)
+         rope_scaling = getattr(config, "rope_scaling", None)
+         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+         self.self_attn = MiniCPMAttention(
+             hidden_size=self.hidden_size,
+             num_heads=config.num_attention_heads,
+             num_kv_heads=config.num_key_value_heads,
+             layer_id=layer_id,
+             rope_theta=rope_theta,
+             rope_scaling=rope_scaling,
+             max_position_embeddings=max_position_embeddings,
+             quant_config=quant_config,
+         )
+         self.mlp = MiniCPMMLP(
+             hidden_size=self.hidden_size,
+             intermediate_size=config.intermediate_size,
+             hidden_act=config.hidden_act,
+             quant_config=quant_config,
+         )
+         self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.post_attention_layernorm = RMSNorm(
+             config.hidden_size, eps=config.rms_norm_eps
+         )
+
+     def forward(
+         self,
+         positions: torch.Tensor,
+         hidden_states: torch.Tensor,
+         input_metadata: InputMetadata,
+         residual: Optional[torch.Tensor],
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         # Self Attention
+         residual = hidden_states
+         hidden_states = self.input_layernorm(hidden_states)
+         hidden_states = self.self_attn(
+             positions=positions,
+             hidden_states=hidden_states,
+             input_metadata=input_metadata,
+         )
+         hidden_states = residual + hidden_states * (
+             self.config.scale_depth / math.sqrt(self.config.num_hidden_layers)
+         )
+
+         # Fully Connected
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         hidden_states = self.mlp(hidden_states)
+         hidden_states = residual + hidden_states * (
+             self.config.scale_depth / math.sqrt(self.config.num_hidden_layers)
+         )
+
+         return hidden_states, None
+
+
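Note: each residual connection in the decoder layer above is scaled, i.e. the sublayer output is multiplied by scale_depth / sqrt(num_hidden_layers) before being added back; the layer also returns None in the residual slot, so residual handling stays local to the layer. A small worked sketch of the factor (the config values below are illustrative; the real ones come from the HF checkpoint):

    import math

    def scaled_residual(residual, sublayer_out, scale_depth, num_hidden_layers):
        # MiniCPM damps each sublayer update before adding it to the residual stream.
        return residual + sublayer_out * (scale_depth / math.sqrt(num_hidden_layers))

    # e.g. scale_depth = 1.4 over 40 layers shrinks each update to about 0.22x
    print(1.4 / math.sqrt(40))  # ~0.2214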
+ class MiniCPMModel(nn.Module):
+
+     def __init__(
+         self,
+         config,
+         quant_config: Optional[QuantizationConfig] = None,
+     ) -> None:
+         super().__init__()
+         self.config = config
+         self.padding_idx = config.pad_token_id
+         self.vocab_size = config.vocab_size
+         self.embed_tokens = VocabParallelEmbedding(
+             self.vocab_size,
+             config.hidden_size,
+             org_num_embeddings=config.vocab_size,
+         )
+         self.layers = nn.ModuleList(
+             [
+                 MiniCPMDecoderLayer(config, i, quant_config=quant_config)
+                 for i in range(config.num_hidden_layers)
+             ]
+         )
+         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         positions: torch.Tensor,
+         input_metadata: InputMetadata,
+         input_embeds: torch.Tensor = None,
+     ) -> torch.Tensor:
+         if input_embeds is None:
+             hidden_states = self.embed_tokens(input_ids) * self.config.scale_emb
+         else:
+             hidden_states = input_embeds
+         residual = None
+
+         for i in range(len(self.layers)):
+             layer = self.layers[i]
+             hidden_states, residual = layer(
+                 positions,
+                 hidden_states,
+                 input_metadata,
+                 residual,
+             )
+         hidden_states = self.norm(hidden_states)
+         return hidden_states
+
+
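Note: the forward pass above multiplies the token embeddings by config.scale_emb before they enter the decoder stack (MiniCPMForCausalLM below applies the same factor to caller-supplied input_embeds). A one-line sketch of that step; the scale value and sizes are illustrative only:

    import torch

    def embed_inputs(embed_tokens: torch.nn.Embedding, input_ids: torch.Tensor, scale_emb: float = 12.0):
        # scale_emb comes from the HF config; 12.0 is an illustrative value.
        return embed_tokens(input_ids) * scale_emb

    emb = torch.nn.Embedding(32000, 64)
    hidden = embed_inputs(emb, torch.tensor([[1, 2, 3]]))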
+ class MiniCPMForCausalLM(nn.Module):
+     def __init__(
+         self,
+         config,
+         quant_config: Optional[QuantizationConfig] = None,
+         cache_config: Optional[CacheConfig] = None,
+     ) -> None:
+         super().__init__()
+         self.config = config
+
+         self.num_experts = getattr(self.config, "num_experts", 0)
+         self.quant_config = quant_config
+         self.model = MiniCPMModel(config, quant_config=quant_config)
+         # self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+         if not self.config.tie_word_embeddings:
+             self.lm_head = ParallelLMHead(
+                 config.vocab_size,
+                 config.hidden_size,
+                 org_num_embeddings=config.vocab_size,
+             )
+
+         self.scale_width = self.config.hidden_size / self.config.dim_model_base
+
+         self.logits_processor = LogitsProcessor(config)
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         positions: torch.Tensor,
+         input_metadata: InputMetadata,
+         input_embeds: torch.Tensor = None,
+     ) -> torch.Tensor:
+         if input_embeds is not None:
+             input_embeds = input_embeds * self.config.scale_emb
+         hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
+         hidden_states = hidden_states / self.scale_width
+         if self.config.tie_word_embeddings:
+             lm_head_weight = self.model.embed_tokens.weight
+         else:
+             lm_head_weight = self.lm_head.weight
+         return self.logits_processor(
+             input_ids, hidden_states, lm_head_weight, input_metadata
+         )
+
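Note: the head applies MiniCPM's width scaling before the logits: hidden states are divided by scale_width = hidden_size / dim_model_base, and when tie_word_embeddings is set the input embedding matrix is reused as the output projection. A minimal sketch of that logits path (sizes are illustrative; inside the package the projection is done by LogitsProcessor):

    import torch

    def minicpm_logits(hidden_states, lm_head_weight, hidden_size, dim_model_base):
        # Divide by scale_width = hidden_size / dim_model_base, then project to the vocab.
        scale_width = hidden_size / dim_model_base
        return (hidden_states / scale_width) @ lm_head_weight.t()

    h = torch.randn(2, 3, 2304)   # illustrative hidden states
    w = torch.randn(32000, 2304)  # tied embedding / lm_head weight
    logits = minicpm_logits(h, w, hidden_size=2304, dim_model_base=256)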
+     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+         stacked_params_mapping = [
+             # (param_name, shard_name, shard_id)
+             ("qkv_proj", "q_proj", "q"),
+             ("qkv_proj", "k_proj", "k"),
+             ("qkv_proj", "v_proj", "v"),
+             ("gate_up_proj", "gate_proj", 0),
+             ("gate_up_proj", "up_proj", 1),
+         ]
+         expert_params_mapping = [
+             # (param_name, weight_name, expert_id)
+             (
+                 "ws" if weight_name in ["w1", "w3"] else "w2s",
+                 f"experts.{expert_id}.{weight_name}.weight",
+                 expert_id,
+             )
+             for expert_id in range(self.num_experts)
+             for weight_name in ["w1", "w2", "w3"]
+         ]
+         params_dict = dict(self.named_parameters())
+         for name, loaded_weight in weights:
+             if "rotary_emb.inv_freq" in name:
+                 continue
+             if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                 # Models trained using ColossalAI may include these tensors in
+                 # the checkpoint. Skip them.
+                 continue
+
+             for param_name, weight_name, shard_id in stacked_params_mapping:
+                 if weight_name not in name:
+                     continue
+                 name = name.replace(weight_name, param_name)
+                 # Skip loading extra bias for GPTQ models.
+                 if name.endswith(".bias") and name not in params_dict:
+                     continue
+                 param = params_dict[name]
+                 weight_loader = param.weight_loader
+                 weight_loader(param, loaded_weight, shard_id)
+                 break
+             else:
+                 for param_name, weight_name, expert_id in expert_params_mapping:
+                     if weight_name not in name:
+                         continue
+                     name = name.replace(weight_name, param_name)
+                     param = params_dict[name]
+                     weight_loader = param.weight_loader
+                     weight_loader(
+                         param, loaded_weight, weight_name, expert_id=expert_id
+                     )
+                     break
+                 else:
+                     # Skip loading extra bias for GPTQ models.
+                     if name.endswith(".bias") and name not in params_dict:
+                         continue
+                     param = params_dict[name]
+                     weight_loader = getattr(
+                         param, "weight_loader", default_weight_loader
+                     )
+                     weight_loader(param, loaded_weight)
+
+
+ EntryClass = MiniCPMForCausalLM
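Note: load_weights maps HuggingFace checkpoint names onto the fused parameters defined above: q_proj/k_proj/v_proj tensors are loaded into qkv_proj shards and gate_proj/up_proj into gate_up_proj, with a for/else fallback to default_weight_loader for everything else (the expert mapping is only exercised when the config defines num_experts), and EntryClass = MiniCPMForCausalLM is the hook sglang's model loader uses to pick up the class. A small sketch of the renaming step only; the helper below is illustrative, not part of the package:

    # Checkpoint-name remapping mirroring stacked_params_mapping in load_weights.
    STACKED = [
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]

    def map_checkpoint_name(name: str):
        for param_name, weight_name, shard_id in STACKED:
            if weight_name in name:
                return name.replace(weight_name, param_name), shard_id
        return name, None

    print(map_checkpoint_name("model.layers.0.self_attn.k_proj.weight"))
    # -> ('model.layers.0.self_attn.qkv_proj.weight', 'k')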