sglang 0.3.6.post2__py3-none-any.whl → 0.3.6.post3__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
@@ -15,16 +15,19 @@
 
 import dataclasses
 import logging
+import signal
 import threading
 from queue import Queue
 from typing import Optional
 
+import psutil
 import torch
 
 from sglang.srt.managers.io_struct import UpdateWeightReqInput
 from sglang.srt.managers.schedule_batch import ModelWorkerBatch
 from sglang.srt.managers.tp_worker import TpModelWorker
 from sglang.srt.server_args import ServerArgs
+from sglang.utils import get_exception_traceback
 
 logger = logging.getLogger(__name__)
 
@@ -70,6 +73,7 @@ class TpModelWorkerClient:
             target=self.forward_thread_func,
         )
         self.forward_thread.start()
+        self.parent_process = psutil.Process().parent()
 
     def get_worker_info(self):
         return self.worker.get_worker_info()
@@ -87,8 +91,13 @@ class TpModelWorkerClient:
         )
 
     def forward_thread_func(self):
-        with torch.cuda.stream(self.forward_stream):
-            self.forward_thread_func_()
+        try:
+            with torch.cuda.stream(self.forward_stream):
+                self.forward_thread_func_()
+        except Exception:
+            traceback = get_exception_traceback()
+            logger.error(f"TpModelWorkerClient hit an exception: {traceback}")
+            self.parent_process.send_signal(signal.SIGQUIT)
 
     @torch.no_grad()
     def forward_thread_func_(self):
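
The hunks above make the overlap worker thread fail loudly: an uncaught exception is logged with its traceback and the parent scheduler process is told to shut down via SIGQUIT, instead of the thread dying silently while the server hangs. A minimal standalone sketch of that pattern, assuming only `psutil` and the standard library (the class and method names below are illustrative, not sglang's API):

```python
import logging
import signal
import threading
import traceback

import psutil

logger = logging.getLogger(__name__)


class BackgroundWorker:
    """Runs work on a thread; on failure, signals the parent process to quit."""

    def __init__(self):
        # Captured once at startup; the parent is the process that spawned us.
        self.parent_process = psutil.Process().parent()
        self.thread = threading.Thread(target=self._run, daemon=True)
        self.thread.start()

    def _run(self):
        try:
            self._work()
        except Exception:
            # Log the full traceback, then ask the parent to shut everything down.
            logger.error("worker hit an exception: %s", traceback.format_exc())
            self.parent_process.send_signal(signal.SIGQUIT)

    def _work(self):
        raise RuntimeError("simulated failure")
```

The diff relies on the parent reacting to SIGQUIT; even without a custom handler, the default action terminates the parent, so the failure is visible either way.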
sglang/srt/models/grok.py CHANGED
@@ -16,22 +16,17 @@
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/mixtral.py#L1
 """Inference-only Grok1 model."""
 
-import warnings
-from typing import Iterable, List, Optional, Tuple
+from typing import Iterable, Optional, Tuple
 
 import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-from vllm.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
+from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.model_loader.loader import DefaultModelLoader
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-from sglang.srt.layers.fused_moe_grok import FusedMoE
+from sglang.srt.layers.fused_moe_triton import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     QKVParallelLinear,
@@ -41,10 +36,12 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.torchao_utils import apply_torchao_config_
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
+from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 
@@ -293,17 +290,11 @@ class Grok1ForCausalLM(nn.Module):
         super().__init__()
         self.config = config
         self.quant_config = quant_config
+        self.torchao_config = global_server_args_dict["torchao_config"]
         self.model = Grok1Model(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
 
-        # Monkey patch _prepare_weights to load pre-sharded weights
-        setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights)
-
-        self.use_presharded_weights = True
-
-        warnings.filterwarnings("ignore", category=FutureWarning)
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -357,28 +348,23 @@ class Grok1ForCausalLM(nn.Module):
                        continue
                    name = name.replace(weight_name, param_name)
 
-                    if self.use_presharded_weights:
-                        extra_kwargs = {
-                            "use_presharded_weights": self.use_presharded_weights
-                        }
-                    else:
-                        extra_kwargs = {}
-
                    param = params_dict[name]
                    weight_loader = param.weight_loader
                    weight_loader(
                        param,
                        loaded_weight,
-                        weight_name,
+                        name,
                        shard_id=shard_id,
                        expert_id=expert_id,
-                        **extra_kwargs,
                    )
                    break
                else:
                    # Skip loading extra bias for GPTQ models.
                    if name.endswith(".bias") and name not in params_dict:
                        continue
+                    # Skip loading kv_scale from ckpts towards new design.
+                    if name.endswith(".kv_scale") and name not in params_dict:
+                        continue
                    if name is None:
                        continue
 
@@ -388,30 +374,7 @@ class Grok1ForCausalLM(nn.Module):
                    )
                    weight_loader(param, loaded_weight)
 
-
-old_prepare_weights = getattr(DefaultModelLoader, "_prepare_weights")
-
-
-def _prepare_presharded_weights(
-    self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool
-) -> Tuple[str, List[str], bool]:
-    import glob
-    import os
-
-    if get_tensor_model_parallel_world_size() == 1:
-        return old_prepare_weights(self, model_name_or_path, revision, fall_back_to_pt)
-
-    tp_rank = get_tensor_model_parallel_rank()
-    allow_patterns = [f"*-{tp_rank:03d}.bin"]
-
-    hf_folder = model_name_or_path
-
-    hf_weights_files: List[str] = []
-    for pattern in allow_patterns:
-        hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
-    use_safetensors = False
-
-    return hf_folder, hf_weights_files, use_safetensors
+        apply_torchao_config_(self, params_dict, set(["proj.weight"]))
 
 
 class Grok1ModelForCausalLM(Grok1ForCausalLM):
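
In grok.py, this release drops the pre-sharded-weight monkey patch on vllm's `DefaultModelLoader` and instead runs a torchao quantization hook over matching parameters after `load_weights` finishes: `apply_torchao_config_(self, params_dict, set(["proj.weight"]))`. A rough, generic sketch of that post-load rewrite pattern, written against plain PyTorch rather than sglang's actual `apply_torchao_config_` API (the helper names and the int8 round-trip are illustrative assumptions):

```python
from typing import Callable, Dict, Set

import torch
from torch import nn


def apply_post_load_transform(
    module: nn.Module,
    params_dict: Dict[str, nn.Parameter],
    filter_suffixes: Set[str],
    transform: Callable[[torch.Tensor], torch.Tensor],
) -> None:
    """Rewrite selected parameters in place after checkpoint loading.

    Only parameters whose name ends with one of `filter_suffixes`
    (e.g. "proj.weight") are touched, mirroring the suffix filter the
    diff passes to its torchao hook.
    """
    for name, param in params_dict.items():
        if any(name.endswith(suffix) for suffix in filter_suffixes):
            with torch.no_grad():
                param.copy_(transform(param.data))


# Example transform: fake-quantize matching weights to int8 and back (illustrative only).
def int8_roundtrip(w: torch.Tensor) -> torch.Tensor:
    scale = w.abs().amax().clamp(min=1e-8) / 127.0
    return (w / scale).round().clamp(-128, 127) * scale
```

The real hook reads the config from `global_server_args_dict["torchao_config"]` and applies torchao's kernels; the sketch only shows where such a rewrite slots into the loading flow.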
@@ -57,7 +57,7 @@ class LlavaBaseForCausalLM(nn.Module):
         else:
             image_aspect_ratio = "anyres"
         offset_list = []
-        for image_s in image_sizes:
+        for image_idx, image_s in enumerate(image_sizes):
             if len(image_sizes) > 16:
                 # 2x2 pooling with stride 2
                 new_image_feature_len = (
@@ -92,10 +92,6 @@ class LlavaBaseForCausalLM(nn.Module):
                        new_w = int(new_w // times)
                new_image_feature_len += new_h * (new_w + 1)
 
-            pad_ids = pad_values * (
-                (new_image_feature_len + len(pad_values)) // len(pad_values)
-            )
-            # print("calculated new_image_feature_len: ", new_image_feature_len)
             try:
                 offset = input_ids.index(self.config.image_token_index)
             except ValueError:
@@ -103,7 +99,7 @@ class LlavaBaseForCausalLM(nn.Module):
             # old_len + pad_len - 1, because we need to remove image_token_id
             input_ids = (
                 input_ids[:offset]
-                + pad_ids[:new_image_feature_len]
+                + [pad_values[image_idx]] * new_image_feature_len
                 + input_ids[offset + 1 :]
             )
             offset_list.append(offset)
@@ -138,7 +134,6 @@ class LlavaBaseForCausalLM(nn.Module):
         image_inputs = forward_batch.image_inputs
 
         if forward_batch.forward_mode.is_extend():
-            bs = forward_batch.batch_size
             # Got List[List[str]] extend it to List[str]
             # The length of the List should be equal to batch size
             modalities_list = []
@@ -146,11 +141,16 @@ class LlavaBaseForCausalLM(nn.Module):
             for im in image_inputs:
                 if im and im.modalities is not None:
                     modalities_list.extend(im.modalities)
-                if im and im.image_offsets is not None:
+                if im and im.image_offsets:
                     max_image_offset.append(max(im.image_offsets))
                 else:
                     max_image_offset.append(-1)
 
+            # Clamp input ids. This is because the input_ids for the image tokens are
+            # filled with the hash values of the image for the prefix matching in the radix attention.
+            # There values are useless because their embeddings will be replaced by vision embeddings anyway.
+            input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
+
             # Embed text inputs
             input_embeds = self.language_model.model.embed_tokens(input_ids)
 
@@ -158,6 +158,7 @@ class LlavaBaseForCausalLM(nn.Module):
             need_vision = start_positions <= np.array(max_image_offset)
 
             if need_vision.any():
+                bs = forward_batch.batch_size
                 pixel_values = [
                     image_inputs[i].pixel_values for i in range(bs) if need_vision[i]
                 ]
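
The llava hunks (and the qwen2_vl hunks later in this diff) clamp `input_ids` before the embedding lookup: image-token positions carry per-image hash values that exist only so the radix cache can prefix-match on them, and those values can far exceed the vocabulary size. A tiny sketch of why the clamp is needed, with made-up sizes (not sglang's code):

```python
import torch
from torch import nn

vocab_size, hidden_size = 32000, 16
embed_tokens = nn.Embedding(vocab_size, hidden_size)

# The third position holds an image hash used only for radix-cache prefix matching.
input_ids = torch.tensor([[1, 15, 987654321, 42]])

# Without clamping, the hash "token" indexes past the embedding table and errors out.
input_ids.clamp_(min=0, max=vocab_size - 1)
embeds = embed_tokens(input_ids)  # safe: the clamped rows are later overwritten by vision features
print(embeds.shape)  # torch.Size([1, 4, 16])
```

The clamped embeddings are throwaway values; the vision encoder output is scattered over those positions immediately afterwards.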
@@ -0,0 +1,392 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/olmo2.py
+"""Inference-only OLMo2 model compatible with HuggingFace weights."""
+from functools import partial
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+)
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.utils import make_layers
+
+
+class Olmo2Attention(nn.Module):
+    """
+    This is the attention block where the output is computed as
+    ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+
+        assert self.hidden_size % self.total_num_heads == 0
+        assert self.total_num_heads % tp_size == 0
+
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = self.config.num_key_value_heads
+
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+
+        # Attention input projection. Projects x -> (q, k, v)
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            bias=config.attention_bias,
+        )
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.k_norm = RMSNorm(
+            self.total_num_kv_heads * self.head_dim,
+            eps=self.config.rms_norm_eps,
+        )
+        self.q_norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+        # Rotary embeddings.
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+        self.scaling = self.head_dim**-0.5
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+        )
+
+        # Attention output projection.
+        self.o_proj = RowParallelLinear(
+            self.head_dim * self.total_num_heads,
+            self.hidden_size,
+            bias=config.attention_bias,
+        )
+
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.tp_size > 1:
+            q = tensor_model_parallel_all_gather(q.contiguous())
+            k = tensor_model_parallel_all_gather(k.contiguous())
+        q = self.q_norm.forward_native(q)
+        k = self.k_norm.forward_native(k)
+        if self.tp_size > 1:
+            splitter = partial(split_tensor_along_last_dim, num_partitions=self.tp_size)
+            q = splitter(q)[self.tp_rank]
+            k = splitter(k)[self.tp_rank]
+        return q, k
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self._apply_qk_norm(q, k)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Olmo2MLP(nn.Module):
+    """
+    This is the MLP block where the output is computed as
+    ``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+
+        # Feed-forward input projection.
+        self.gate_up_proj = MergedColumnParallelLinear(
+            self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+        )
+
+        # Activation function.
+        self.act_fn = SiluAndMul()
+
+        # Feed-forward output projection.
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Olmo2DecoderLayer(nn.Module):
+    """
+    This is a typical transformer block where the output is
+    computed as ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        # Attention block.
+        self.self_attn = Olmo2Attention(config, layer_id, quant_config)
+
+        # MLP block.
+        self.mlp = Olmo2MLP(config, quant_config)
+
+        # RMSNorm
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.post_feedforward_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # Attention block.
+        residual = hidden_states
+        hidden_states = self.self_attn(positions, hidden_states, forward_batch)
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = hidden_states + residual
+
+        # MLP block.
+        residual = hidden_states
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Olmo2Model(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.config = config
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Olmo2DecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+            ),
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """
+        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
+        """
+        # Get embeddings of input.
+        # shape: (batch_size, seq_len, d_model)
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        # Apply blocks one-by-one.
+        for layer_id, decoder_layer in enumerate(self.layers):
+            # shape: (batch_size, seq_len, d_model)
+            hidden_states = decoder_layer(
+                positions,
+                hidden_states,
+                forward_batch,
+            )
+
+        # Apply final layer norm.
+        # shape: (batch_size, seq_len or 1, d_model)
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class Olmo2ForCausalLM(nn.Module):
+    """
+    Extremely barebones HF model wrapper.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config=None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.model = Olmo2Model(config, quant_config)
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.unpadded_vocab_size = config.vocab_size
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                quant_config=quant_config,
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            input_embeds=input_embeds,
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head.weight, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            # With tie_word_embeddings, we can skip lm_head.weight
+            # The weight might appear unnecessarily in the files if the model is
+            # processed with quantization, LoRA, fine-tuning, etc.
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = Olmo2ForCausalLM
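
The new OLMo2 decoder layer applies RMSNorm to each sub-layer's output before adding the residual (`x + LN(Attn(x))`), rather than the Llama-style pre-norm `x + Attn(LN(x))`. A small self-contained sketch of that ordering, with plain `nn.LayerNorm` standing in for sglang's RMSNorm and toy linear layers standing in for attention and the MLP (all names here are illustrative):

```python
import torch
from torch import nn


class ToyOlmo2Block(nn.Module):
    """Residual block with norms applied to sub-layer outputs (OLMo2-style)."""

    def __init__(self, d_model: int):
        super().__init__()
        self.attn = nn.Linear(d_model, d_model)   # stand-in for self-attention
        self.mlp = nn.Linear(d_model, d_model)    # stand-in for the gated MLP
        self.post_attention_layernorm = nn.LayerNorm(d_model)
        self.post_feedforward_layernorm = nn.LayerNorm(d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x + LN(Attn(x)): normalize the sub-layer output, then add the residual.
        x = x + self.post_attention_layernorm(self.attn(x))
        x = x + self.post_feedforward_layernorm(self.mlp(x))
        return x


block = ToyOlmo2Block(d_model=8)
print(block(torch.randn(2, 4, 8)).shape)  # torch.Size([2, 4, 8])
```

The attention block in the new file also normalizes q and k before RoPE (`_apply_qk_norm`), gathering them across tensor-parallel ranks first so each norm sees the full projection.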
@@ -500,7 +500,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):
         return num_image_tokens
 
     # Use grid_t * grid_w * grid_h to pad tokens for each image
-    # and replaced padding by unique image hash
+    # add replaced padding by unique image hash
     def pad_input_ids(self, input_ids: List[int], image_inputs: ImageInputs):
         image_grid_thws = image_inputs.image_grid_thws
         pad_values = image_inputs.pad_values
@@ -597,13 +597,15 @@ class Qwen2VLForConditionalGeneration(nn.Module):
            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
                `None` if no images are passed.
        """
+        if getattr(self.config, "rope_scaling", {}).get("type", None) == "mrope":
+            positions = forward_batch.mrope_positions
+
        image_inputs = None
        if forward_batch.image_inputs is not None:
            image_inputs = [
                img for img in forward_batch.image_inputs if img is not None
            ]
-            if getattr(self.config, "rope_scaling", {}).get("type", None) == "mrope":
-                positions = forward_batch.mrope_positions
+
        if (
            forward_batch.forward_mode.is_decode()
            or image_inputs is None
@@ -617,6 +619,11 @@ class Qwen2VLForConditionalGeneration(nn.Module):
                f"(3, seq_len) positions, but got {positions.size()}"
            )
 
+        # Clamp input ids. This is because the input_ids for the image tokens are
+        # filled with the hash values of the image for the prefix matching in the radix attention.
+        # There values are useless because their embeddings will be replaced by vision embeddings anyway.
+        input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
+
        inputs_embeds = self.model.embed_tokens(input_ids)
        extend_start_loc_cpu = forward_batch.extend_start_loc.cpu().numpy()
        prefix_lens_cpu = forward_batch.extend_prefix_lens_cpu
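
The second qwen2_vl hunk hoists the mrope check out of the `image_inputs` branch: a decode batch carries no image inputs, but the model still needs the 3D `mrope_positions` whenever rope_scaling is configured as mrope. A minimal sketch of that selection logic (the `ForwardBatch`-like object and helper name here are stand-ins, not sglang's API):

```python
from types import SimpleNamespace

import torch


def select_positions(rope_scaling: dict, forward_batch) -> torch.Tensor:
    """Pick mrope positions whenever the model is configured for mrope,
    regardless of whether this batch carries image inputs."""
    if (rope_scaling or {}).get("type") == "mrope":
        return forward_batch.mrope_positions  # shape (3, seq_len)
    return forward_batch.positions  # shape (seq_len,)


# Decode-time batch: no images, but mrope positions must still be used.
batch = SimpleNamespace(
    positions=torch.arange(4),
    mrope_positions=torch.arange(12).reshape(3, 4),
    image_inputs=None,
)
print(select_positions({"type": "mrope"}, batch).shape)  # torch.Size([3, 4])
```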
@@ -1286,7 +1286,7 @@ def v1_embedding_request(all_requests, tokenizer_manager):
         else:
             prompt_kwargs = {"input_ids": prompt}
     else:
-        if isinstance(prompts[0], str) or isinstance(propmts[0][0], str):
+        if isinstance(prompts[0], str) or isinstance(prompts[0][0], str):
             prompt_kwargs = {"text": prompts}
         else:
             prompt_kwargs = {"input_ids": prompts}
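
The last hunk is a one-character fix: `propmts` → `prompts`, a NameError that only surfaced when the first prompt in a batched embedding request was not a plain string (the `or` short-circuits for string prompts). A standalone sketch of the dispatch the fixed line performs (the types and helper name are illustrative):

```python
from typing import Dict, List, Union

Prompt = Union[str, List[int]]


def build_prompt_kwargs(prompts: List[Prompt]) -> Dict[str, List[Prompt]]:
    """Route batched prompts to the tokenizer as raw text or as token ids."""
    if isinstance(prompts[0], str) or isinstance(prompts[0][0], str):
        return {"text": prompts}
    return {"input_ids": prompts}


print(build_prompt_kwargs(["hello", "world"]))   # {'text': ['hello', 'world']}
print(build_prompt_kwargs([[1, 2, 3], [4, 5]]))  # {'input_ids': [[1, 2, 3], [4, 5]]}
```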