sglang 0.3.6.post1__py3-none-any.whl → 0.3.6.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. sglang/bench_offline_throughput.py +55 -2
  2. sglang/bench_one_batch.py +4 -8
  3. sglang/bench_one_batch_server.py +6 -5
  4. sglang/check_env.py +7 -1
  5. sglang/lang/tracer.py +1 -1
  6. sglang/launch_server.py +2 -4
  7. sglang/srt/configs/model_config.py +2 -6
  8. sglang/srt/layers/attention/flashinfer_backend.py +3 -3
  9. sglang/srt/layers/sampler.py +1 -1
  10. sglang/srt/managers/data_parallel_controller.py +7 -11
  11. sglang/srt/managers/detokenizer_manager.py +7 -6
  12. sglang/srt/managers/image_processor.py +7 -10
  13. sglang/srt/managers/io_struct.py +0 -10
  14. sglang/srt/managers/schedule_batch.py +51 -13
  15. sglang/srt/managers/scheduler.py +41 -29
  16. sglang/srt/managers/session_controller.py +15 -7
  17. sglang/srt/managers/tokenizer_manager.py +4 -33
  18. sglang/srt/managers/tp_worker_overlap_thread.py +11 -2
  19. sglang/srt/models/grok.py +11 -48
  20. sglang/srt/models/llava.py +16 -9
  21. sglang/srt/models/olmo2.py +392 -0
  22. sglang/srt/models/qwen2_vl.py +10 -3
  23. sglang/srt/openai_api/adapter.py +1 -1
  24. sglang/srt/server.py +48 -45
  25. sglang/srt/server_args.py +1 -1
  26. sglang/srt/utils.py +22 -24
  27. sglang/test/test_utils.py +21 -8
  28. sglang/utils.py +2 -2
  29. sglang/version.py +1 -1
  30. {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post3.dist-info}/METADATA +4 -2
  31. {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post3.dist-info}/RECORD +34 -36
  32. sglang/srt/layers/fused_moe_grok/__init__.py +0 -1
  33. sglang/srt/layers/fused_moe_grok/fused_moe.py +0 -692
  34. sglang/srt/layers/fused_moe_grok/layer.py +0 -630
  35. {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post3.dist-info}/LICENSE +0 -0
  36. {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post3.dist-info}/WHEEL +0 -0
  37. {sglang-0.3.6.post1.dist-info → sglang-0.3.6.post3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,392 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/olmo2.py
+"""Inference-only OLMo2 model compatible with HuggingFace weights."""
+from functools import partial
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+)
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.utils import make_layers
+
+
+class Olmo2Attention(nn.Module):
+    """
+    This is the attention block where the output is computed as
+    ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+
+        assert self.hidden_size % self.total_num_heads == 0
+        assert self.total_num_heads % tp_size == 0
+
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = self.config.num_key_value_heads
+
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+
+        # Attention input projection. Projects x -> (q, k, v)
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            bias=config.attention_bias,
+        )
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.k_norm = RMSNorm(
+            self.total_num_kv_heads * self.head_dim,
+            eps=self.config.rms_norm_eps,
+        )
+        self.q_norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+        # Rotary embeddings.
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+        self.scaling = self.head_dim**-0.5
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+        )
+
+        # Attention output projection.
+        self.o_proj = RowParallelLinear(
+            self.head_dim * self.total_num_heads,
+            self.hidden_size,
+            bias=config.attention_bias,
+        )
+
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.tp_size > 1:
+            q = tensor_model_parallel_all_gather(q.contiguous())
+            k = tensor_model_parallel_all_gather(k.contiguous())
+        q = self.q_norm.forward_native(q)
+        k = self.k_norm.forward_native(k)
+        if self.tp_size > 1:
+            splitter = partial(split_tensor_along_last_dim, num_partitions=self.tp_size)
+            q = splitter(q)[self.tp_rank]
+            k = splitter(k)[self.tp_rank]
+        return q, k
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self._apply_qk_norm(q, k)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Olmo2MLP(nn.Module):
+    """
+    This is the MLP block where the output is computed as
+    ``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+
+        # Feed-forward input projection.
+        self.gate_up_proj = MergedColumnParallelLinear(
+            self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+        )
+
+        # Activation function.
+        self.act_fn = SiluAndMul()
+
+        # Feed-forward output projection.
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Olmo2DecoderLayer(nn.Module):
+    """
+    This is a typical transformer block where the output is
+    computed as ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        # Attention block.
+        self.self_attn = Olmo2Attention(config, layer_id, quant_config)
+
+        # MLP block.
+        self.mlp = Olmo2MLP(config, quant_config)
+
+        # RMSNorm
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.post_feedforward_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # Attention block.
+        residual = hidden_states
+        hidden_states = self.self_attn(positions, hidden_states, forward_batch)
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = hidden_states + residual
+
+        # MLP block.
+        residual = hidden_states
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Olmo2Model(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.config = config
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Olmo2DecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+            ),
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """
+        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
+        """
+        # Get embeddings of input.
+        # shape: (batch_size, seq_len, d_model)
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        # Apply blocks one-by-one.
+        for layer_id, decoder_layer in enumerate(self.layers):
+            # shape: (batch_size, seq_len, d_model)
+            hidden_states = decoder_layer(
+                positions,
+                hidden_states,
+                forward_batch,
+            )
+
+        # Apply final layer norm.
+        # shape: (batch_size, seq_len or 1, d_model)
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class Olmo2ForCausalLM(nn.Module):
+    """
+    Extremely barebones HF model wrapper.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config=None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.model = Olmo2Model(config, quant_config)
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.unpadded_vocab_size = config.vocab_size
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                quant_config=quant_config,
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            input_embeds=input_embeds,
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head.weight, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            # With tie_word_embeddings, we can skip lm_head.weight
+            # The weight might appear unnecessarily in the files if the model is
+            # processed with quantization, LoRA, fine-tuning, etc.
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = Olmo2ForCausalLM
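
Since the new file registers `EntryClass = Olmo2ForCausalLM`, an OLMo2 checkpoint with HuggingFace weights can be served like any other sglang model. A minimal sketch (not part of the diff) using sglang's native `/generate` endpoint; the checkpoint name and port are illustrative assumptions:

# Launch the server first (shell), assuming an OLMo2-architecture HF checkpoint:
#   python -m sglang.launch_server --model-path allenai/OLMo-2-1124-7B-Instruct --port 30000
import requests

resp = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {"temperature": 0, "max_new_tokens": 16},
    },
)
print(resp.json()["text"])
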
@@ -500,7 +500,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):
         return num_image_tokens
 
     # Use grid_t * grid_w * grid_h to pad tokens for each image
-    # and replaced padding by unique image hash
+    # add replaced padding by unique image hash
     def pad_input_ids(self, input_ids: List[int], image_inputs: ImageInputs):
         image_grid_thws = image_inputs.image_grid_thws
         pad_values = image_inputs.pad_values
@@ -597,13 +597,15 @@ class Qwen2VLForConditionalGeneration(nn.Module):
             image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
                 `None` if no images are passed.
         """
+        if getattr(self.config, "rope_scaling", {}).get("type", None) == "mrope":
+            positions = forward_batch.mrope_positions
+
         image_inputs = None
         if forward_batch.image_inputs is not None:
             image_inputs = [
                 img for img in forward_batch.image_inputs if img is not None
             ]
-        if getattr(self.config, "rope_scaling", {}).get("type", None) == "mrope":
-            positions = forward_batch.mrope_positions
+
         if (
             forward_batch.forward_mode.is_decode()
             or image_inputs is None
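
The hunk above moves the M-RoPE check ahead of the image-input filtering, so `forward_batch.mrope_positions` is used whenever the config enables mrope, including decode steps that carry no image inputs. A stand-in sketch of just that selection logic (the config object and tensors below are illustrative, not sglang internals):

from types import SimpleNamespace
import torch

cfg = SimpleNamespace(rope_scaling={"type": "mrope", "mrope_section": [16, 24, 24]})

def select_positions(cfg, positions, mrope_positions):
    # Use (3, seq_len) M-RoPE positions whenever mrope is configured,
    # regardless of whether the current batch carries image inputs.
    if getattr(cfg, "rope_scaling", {}).get("type", None) == "mrope":
        return mrope_positions
    return positions

positions = torch.arange(8)                      # regular 1-D positions
mrope_positions = torch.arange(8).expand(3, 8)   # temporal/height/width ids
print(select_positions(cfg, positions, mrope_positions).shape)  # torch.Size([3, 8])
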
@@ -617,6 +619,11 @@ class Qwen2VLForConditionalGeneration(nn.Module):
                     f"(3, seq_len) positions, but got {positions.size()}"
                 )
 
+            # Clamp input ids. This is because the input_ids for the image tokens are
+            # filled with the hash values of the image for the prefix matching in the radix attention.
+            # There values are useless because their embeddings will be replaced by vision embeddings anyway.
+            input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
+
             inputs_embeds = self.model.embed_tokens(input_ids)
             extend_start_loc_cpu = forward_batch.extend_start_loc.cpu().numpy()
             prefix_lens_cpu = forward_batch.extend_prefix_lens_cpu
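
The comment in the hunk above gives the rationale for the clamp: image-token slots in `input_ids` hold per-image hash values used only for radix-cache prefix matching, and those values can fall outside `[0, vocab_size)`. A self-contained sketch of the failure mode and the fix, with assumed shapes and values rather than sglang internals:

import torch
from torch import nn

vocab_size, hidden_size = 32, 8
embed_tokens = nn.Embedding(vocab_size, hidden_size)

# Positions 3..6 stand in for image tokens carrying a large image-hash id.
input_ids = torch.tensor([1, 5, 9, 987654321, 987654321, 987654321, 987654321, 2])

# embed_tokens(input_ids)  # would raise IndexError: index out of range in self

# Clamp before the lookup; the hashed slots are placeholders whose embeddings
# are overwritten by the vision embeddings afterwards.
input_ids = input_ids.clamp(min=0, max=vocab_size - 1)
inputs_embeds = embed_tokens(input_ids)
inputs_embeds[3:7] = torch.randn(4, hidden_size)  # stand-in for vision features
print(inputs_embeds.shape)  # torch.Size([8, 8])
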
@@ -1286,7 +1286,7 @@ def v1_embedding_request(all_requests, tokenizer_manager):
         else:
             prompt_kwargs = {"input_ids": prompt}
     else:
-        if isinstance(prompts[0], str) or isinstance(propmts[0][0], str):
+        if isinstance(prompts[0], str) or isinstance(prompts[0][0], str):
             prompt_kwargs = {"text": prompts}
         else:
             prompt_kwargs = {"input_ids": prompts}