sglang 0.4.2__py3-none-any.whl → 0.4.2.post2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (85)
  1. sglang/srt/constrained/outlines_backend.py +9 -1
  2. sglang/srt/custom_op.py +40 -0
  3. sglang/srt/entrypoints/engine.py +2 -2
  4. sglang/srt/layers/activation.py +10 -5
  5. sglang/srt/layers/attention/flashinfer_backend.py +284 -39
  6. sglang/srt/layers/attention/triton_backend.py +71 -7
  7. sglang/srt/layers/attention/triton_ops/decode_attention.py +53 -59
  8. sglang/srt/layers/attention/triton_ops/prefill_attention.py +6 -0
  9. sglang/srt/layers/attention/vision.py +243 -40
  10. sglang/srt/layers/layernorm.py +1 -5
  11. sglang/srt/layers/moe/ep_moe/layer.py +1 -3
  12. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  13. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  14. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +200 -0
  15. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +200 -0
  16. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +200 -0
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +178 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +200 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +175 -0
  20. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -11
  21. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -3
  22. sglang/srt/layers/moe/topk.py +4 -0
  23. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  24. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  25. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  26. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  27. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  30. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  31. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  32. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  33. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  34. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  35. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  36. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  37. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  39. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  40. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  41. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  42. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  44. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/fp8.py +7 -0
  46. sglang/srt/layers/quantization/fp8_kernel.py +140 -2
  47. sglang/srt/layers/rotary_embedding.py +29 -15
  48. sglang/srt/layers/sampler.py +9 -6
  49. sglang/srt/lora/backend/__init__.py +8 -0
  50. sglang/srt/lora/backend/base_backend.py +95 -0
  51. sglang/srt/lora/backend/flashinfer_backend.py +91 -0
  52. sglang/srt/lora/backend/triton_backend.py +61 -0
  53. sglang/srt/lora/lora.py +127 -112
  54. sglang/srt/lora/lora_manager.py +50 -18
  55. sglang/srt/lora/triton_ops/__init__.py +5 -0
  56. sglang/srt/lora/triton_ops/qkv_lora_b.py +182 -0
  57. sglang/srt/lora/triton_ops/sgemm_lora_a.py +143 -0
  58. sglang/srt/lora/triton_ops/sgemm_lora_b.py +159 -0
  59. sglang/srt/managers/image_processor.py +77 -38
  60. sglang/srt/managers/scheduler.py +17 -3
  61. sglang/srt/mem_cache/base_prefix_cache.py +4 -0
  62. sglang/srt/mem_cache/chunk_cache.py +3 -0
  63. sglang/srt/mem_cache/radix_cache.py +30 -1
  64. sglang/srt/model_executor/cuda_graph_runner.py +77 -80
  65. sglang/srt/model_executor/forward_batch_info.py +58 -59
  66. sglang/srt/model_executor/model_runner.py +2 -2
  67. sglang/srt/models/minicpmv.py +129 -76
  68. sglang/srt/models/mllama.py +16 -56
  69. sglang/srt/models/qwen2.py +4 -1
  70. sglang/srt/models/qwen2_vl.py +19 -9
  71. sglang/srt/server_args.py +19 -2
  72. sglang/srt/speculative/build_eagle_tree.py +4 -2
  73. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +213 -0
  74. sglang/srt/speculative/eagle_utils.py +361 -372
  75. sglang/srt/speculative/eagle_worker.py +177 -45
  76. sglang/srt/utils.py +7 -2
  77. sglang/test/runners.py +2 -0
  78. sglang/utils.py +42 -0
  79. sglang/version.py +1 -1
  80. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/METADATA +16 -7
  81. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/RECORD +84 -45
  82. sglang/srt/layers/custom_op_util.py +0 -25
  83. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/LICENSE +0 -0
  84. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/WHEEL +0 -0
  85. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/top_level.txt +0 -0
sglang/srt/models/minicpmv.py CHANGED
@@ -1,6 +1,6 @@
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The vLLM team.
+# Copyright 2023 The SGLang team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
@@ -20,7 +20,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
-from functools import cached_property, partial
+from functools import partial
 from typing import (
     Any,
     Callable,
@@ -33,16 +33,13 @@ from typing import (
     Union,
 )
 
+import numpy as np
 import torch
 import torch.types
 from PIL import Image
 from torch import nn
 from torch.nn.init import trunc_normal_
 from transformers import PretrainedConfig
-from vllm.model_executor.layers.resampler import get_2d_sincos_pos_embed
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
-from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 
 from sglang.srt.distributed import divide, get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import get_act_fn
@@ -63,6 +60,88 @@ from sglang.srt.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
 RawImageType = Union[Image.Image, torch.Tensor]
 
 
+# sin/cos positional embedding helpers are adapted from:
+# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
+def get_1d_sincos_pos_embed_from_grid(
+    embed_dim: int, pos: np.ndarray, version: Tuple[int, int] = (2, 0)
+) -> torch.Tensor:
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,) / (H, W)
+    out: (M, D) / (H, W, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float32)
+    omega /= embed_dim / 2.0
+    omega = 1.0 / 10000**omega  # (D/2,)
+
+    if version == (2, 0):
+        pos = pos.reshape(-1)  # (M,)
+        out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+        emb_sin = np.sin(out)  # (M, D/2)
+        emb_cos = np.cos(out)  # (M, D/2)
+        emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    else:
+        out = np.einsum("hw,d->hwd", pos, omega)  # (H, W, D/2), outer product
+        emb_sin = np.sin(out)  # (H, W, D/2)
+        emb_cos = np.cos(out)  # (H, W, D/2)
+        emb = np.concatenate([emb_sin, emb_cos], axis=-1)  # (H, W, D)
+    return emb
+
+
+def get_2d_sincos_pos_embed_from_grid(
+    embed_dim: int, grid: np.ndarray, version: Tuple[int, int] = (2, 0)
+) -> torch.Tensor:
+    assert embed_dim % 2 == 0
+
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(
+        embed_dim // 2, grid[0], version
+    )  # (H*W, D/2) or (H, W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(
+        embed_dim // 2, grid[1], version
+    )  # (H*W, D/2) or (H, W, D/2)
+
+    if version == (2, 0):
+        emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+    else:
+        emb = np.concatenate([emb_h, emb_w], axis=-1)  # (H, W, D)
+    return emb
+
+
+def get_2d_sincos_pos_embed(
+    embed_dim: int,
+    grid_size: Union[int, Tuple[int, int]],
+    cls_token: bool = False,
+    version: Tuple[int, int] = (2, 0),
+) -> torch.Tensor:
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or
+               [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    if isinstance(grid_size, int):
+        grid_h_size, grid_w_size = grid_size, grid_size
+    else:
+        grid_h_size, grid_w_size = grid_size[0], grid_size[1]
+
+    grid_h = np.arange(grid_h_size, dtype=np.float32)
+    grid_w = np.arange(grid_w_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+    assert isinstance(grid, np.ndarray) and grid.shape == (2, grid_h_size, grid_w_size)
+
+    if version == (2, 0):
+        grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
+        pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
+        if cls_token:
+            pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    else:
+        pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
+    return pos_embed
+
+
 class Idefics2VisionMLP(nn.Module):
 
     def __init__(
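A minimal sketch of how the vendored helper above behaves, assuming the functions added in this hunk are importable from this module as shown; the grid size and embedding dimension are arbitrary example values:

from sglang.srt.models.minicpmv import get_2d_sincos_pos_embed

# A 4x6 patch grid with a 64-dim embedding yields a (4*6, 64) table in the
# default version=(2, 0) layout; cls_token=True prepends a zero row.
pos_embed = get_2d_sincos_pos_embed(embed_dim=64, grid_size=(4, 6))
assert pos_embed.shape == (4 * 6, 64)

pos_embed_cls = get_2d_sincos_pos_embed(embed_dim=64, grid_size=(4, 6), cls_token=True)
assert pos_embed_cls.shape == (1 + 4 * 6, 64)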
@@ -116,6 +195,10 @@ class Idefics2EncoderLayer(nn.Module):
             projection_size=config.intermediate_size,
             use_qkv_parallel=True,
             quant_config=quant_config,
+            dropout=config.attention_dropout,
+            use_context_forward=False,
+            use_full_precision_softmax=True,
+            flatten_batch=False,
             prefix=f"{prefix}.self_attn",
         )
         self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
@@ -126,7 +209,6 @@
         self,
         hidden_states: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        forward_batch: ForwardBatch,
     ) -> torch.Tensor:
         """
         Args:
@@ -136,11 +218,8 @@
         """
         residual = hidden_states
         hidden_states = self.layer_norm1(hidden_states)
-        hidden_states = self.self_attn(
-            hidden_states,
-            cu_seqlens=cu_seqlens,
-            # , forward_batch=forward_batch
-        )
+        hidden_states = self.self_attn(hidden_states, cu_seqlens=cu_seqlens)
+
         hidden_states = residual + hidden_states
         residual = hidden_states
         hidden_states = self.layer_norm2(hidden_states)
@@ -181,7 +260,6 @@ class Idefics2Encoder(nn.Module):
         self,
         inputs_embeds: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        forward_batch: ForwardBatch,
     ) -> torch.Tensor:
         r"""
         Args:
@@ -195,7 +273,8 @@
         hidden_states = inputs_embeds
         for encoder_layer in self.layers:
             layer_outputs = encoder_layer(
-                hidden_states, cu_seqlens=cu_seqlens, forward_batch=forward_batch
+                hidden_states,
+                cu_seqlens=cu_seqlens,
             )
             hidden_states = layer_outputs
         return hidden_states
@@ -232,19 +311,14 @@ class Idefics2VisionEmbeddings(nn.Module):
         self.num_positions = self.num_patches
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
 
-    def forward(
+    def get_position_ids(
         self,
         pixel_values: torch.FloatTensor,
         patch_attention_mask: torch.BoolTensor,
         tgt_sizes: Optional[torch.IntTensor] = None,
-    ) -> torch.Tensor:
+    ):
         batch_size, _, max_im_h, max_im_w = pixel_values.shape
-        target_dtype = self.patch_embedding.weight.dtype
-        pixel_values = pixel_values.to(
-            device=self.patch_embedding.weight.device, dtype=target_dtype
-        )
-        patch_embeds = self.patch_embedding(pixel_values)
-        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+
         max_nb_patches_h, max_nb_patches_w = (
             max_im_h // self.patch_size,
             max_im_w // self.patch_size,
@@ -277,6 +351,24 @@
             ).flatten()
             position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
         position_ids = position_ids.to(self.position_embedding.weight.device)
+        return position_ids
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        patch_attention_mask: torch.BoolTensor,
+        tgt_sizes: Optional[torch.IntTensor] = None,
+    ) -> torch.Tensor:
+        target_dtype = self.patch_embedding.weight.dtype
+        pixel_values = pixel_values.to(
+            device=self.patch_embedding.weight.device, dtype=target_dtype
+        )
+        patch_embeds = self.patch_embedding(pixel_values)
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+        position_ids = self.get_position_ids(
+            pixel_values, patch_attention_mask, tgt_sizes
+        )
+
         embeddings = embeddings + self.position_embedding(position_ids)
         return embeddings
 
@@ -287,7 +379,6 @@ class Idefics2VisionTransformer(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
     ) -> None:
         super().__init__()
 
@@ -302,8 +393,6 @@
 
     def compute_cu_seqlens(self, tgt_sizes: torch.Tensor) -> torch.Tensor:
         patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]  # shape: (batch_size,)
-
-        # 做 prefix sum 来得到 cu_seqlens,注意在最前面插一个 0 作为 offset
         cu_seqlens = torch.cat(
             [
                 torch.tensor([0], device=patch_len.device, dtype=torch.int32),
@@ -316,19 +405,18 @@
     def forward(
         self,
         pixel_values,
-        forward_batch: ForwardBatch,
         patch_attention_mask: Optional[torch.BoolTensor] = None,
         tgt_sizes: Optional[torch.IntTensor] = None,
     ) -> torch.Tensor:
         hidden_states = self.embeddings(
             pixel_values=pixel_values,
             patch_attention_mask=patch_attention_mask,
-            # forward_batch=forward_batch,
             tgt_sizes=tgt_sizes,
         )
         cu_seqlens = self.compute_cu_seqlens(tgt_sizes)
         encoder_outputs = self.encoder(
-            hidden_states, cu_seqlens=cu_seqlens, forward_batch=forward_batch
+            hidden_states,
+            cu_seqlens=cu_seqlens,
         )
         last_hidden_state = self.post_layernorm(encoder_outputs)
         return last_hidden_state
@@ -573,14 +661,12 @@ class MiniCPMVBaseModel(nn.Module):
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
     ):
-        # multimodal_config = config.model_config.multimodal_config
         super().__init__()
         # All MiniCPM-V models disable `tie_word_embeddings` but
         # `PretrainedConfig.tie_word_embeddings` defaults to True; we cannot
-        # check `tie_word_embeddings` until vLLM integrate MiniCPM-V model
+        # check `tie_word_embeddings` until SGLang integrate MiniCPM-V model
         # and config class
         self.config = config
-        # self.multimodal_config = multimodal_config
 
         self.version = get_version_by_config(self.config)
         self.llm = self.init_llm(config=config, quant_config=quant_config)
@@ -598,13 +684,6 @@
 
         self.logits_processor = LogitsProcessor(config)
 
-    @cached_property
-    def sampler(self):
-        if hasattr(self.llm, "sampler"):
-            return self.llm.sampler
-
-        return get_sampler()
-
     def _get_image_bounds(
         self,
         input_ids: torch.Tensor,
@@ -666,7 +745,6 @@
         self,
         input_ids: torch.Tensor,
         image_inputs: Optional[MiniCPMVImageInputs],
-        forward_batch: ForwardBatch,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         vlm_embedding: torch.Tensor = self.llm.get_input_embeddings(input_ids)
 
@@ -680,10 +758,7 @@
                 .to(vlm_embedding.device)
             )
         else:
-            vision_hidden_states = self.get_vision_hidden_states(
-                forward_batch, image_inputs
-            )
-
+            vision_hidden_states = self.get_vision_hidden_states(image_inputs)
             # See NOTE in _parse_and_validate_inputs
             image_bounds = image_inputs["image_bounds"]
             if len(image_bounds) > 0:
@@ -693,6 +768,7 @@
                         for start, end in image_bounds.tolist()
                     ]
                 ).to(vlm_embedding.device)
+
                 vlm_embedding.scatter_(
                     0,
                     image_indices.view(-1, 1).repeat(1, vlm_embedding.shape[-1]),
@@ -839,7 +915,7 @@
         # There values are useless because their embeddings will be replaced by vision embeddings anyway.
         input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
 
-        vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs, forward_batch)
+        vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs)
 
         # always pass the input via `inputs_embeds`
         # to make sure the computation graph is consistent
@@ -857,29 +933,6 @@
             input_ids, hidden_states, self.llm.lm_head, forward_batch
         )
 
-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[torch.Tensor]:
-        return self.llm.compute_logits(hidden_states, sampling_metadata)
-
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
-    def get_mm_mapping(self) -> MultiModelKeys:
-        """
-        Get the module prefix in multimodal models
-        """
-        return MultiModelKeys.from_string_field(
-            language_model="llm", connector="resampler", tower_model="vpm"
-        )
-
     def init_llm(
         self,
         config: Qwen2Config,
@@ -910,9 +963,7 @@
     ) -> torch.Tensor:
         raise NotImplementedError
 
-    def get_vision_hidden_states(
-        self, forward_batch: ForwardBatch, data: MiniCPMVImageInputs
-    ) -> torch.Tensor:
+    def get_vision_hidden_states(self, data: MiniCPMVImageInputs) -> torch.Tensor:
         raise NotImplementedError
 
 
@@ -1019,7 +1070,6 @@ class MiniCPMV2_6(MiniCPMVBaseModel):
 
     def get_vision_hidden_states(
         self,
-        forward_batch: ForwardBatch,
         data: MiniCPMVImageInputs,
     ) -> torch.Tensor:
         pixel_values = data["data"]
@@ -1042,15 +1092,18 @@
             patch_attn_mask = torch.zeros(
                 (B, 1, max_patches), dtype=torch.bool, device=device
            )
-            for i in range(B):
-                patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True
+
+            tgt_sizes_tensor = tgt_sizes.clone().to(device=patch_attn_mask.device)
+            mask_shapes = tgt_sizes_tensor[:, 0] * tgt_sizes_tensor[:, 1]
+            patch_attn_mask[:, 0, :] = torch.arange(
+                patch_attn_mask.size(2), device=patch_attn_mask.device
+            ).unsqueeze(0) < mask_shapes.unsqueeze(1)
+
            vision_embedding = self.vpm(
                 all_pixel_values.type(dtype),
-                forward_batch=forward_batch,
                 patch_attention_mask=patch_attn_mask,
                 tgt_sizes=tgt_sizes,
             )
-
             return self.resampler(vision_embedding, tgt_sizes)
 
     def pad_input_ids(self, input_ids: List[int], image_inputs: ImageInputs):
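The hunk above replaces the per-image Python loop with a single broadcasted comparison. A small, self-contained sketch (with made-up tgt_sizes values) showing that the two constructions produce the same mask:

import torch

# Hypothetical batch of 3 images whose (h, w) patch grids differ.
tgt_sizes = torch.tensor([[2, 3], [1, 4], [3, 3]])
B = tgt_sizes.size(0)
max_patches = int((tgt_sizes[:, 0] * tgt_sizes[:, 1]).max())

# Old loop-based construction.
loop_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool)
for i in range(B):
    loop_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True

# New broadcasted construction, as in the diff above.
vec_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool)
mask_shapes = tgt_sizes[:, 0] * tgt_sizes[:, 1]
vec_mask[:, 0, :] = torch.arange(max_patches).unsqueeze(0) < mask_shapes.unsqueeze(1)

assert torch.equal(loop_mask, vec_mask)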
@@ -1138,7 +1191,7 @@ class MiniCPMV:
     """
    Different versions of MiniCPMV use different visual encoders and LLMs,
     which is not conducive to the current integration logic of LoRA and
-    bitsandbytes in vLLM. Therefore, it is necessary to separate them.
+    bitsandbytes in SGLang. Therefore, it is necessary to separate them.
     """
 
     # Ensure that the LoRA support check passes when the class is not
sglang/srt/models/mllama.py CHANGED
@@ -17,6 +17,7 @@ from transformers.models.mllama.modeling_mllama import (
 import sglang.srt.distributed.parallel_state as ps
 from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.attention.vision import VisionAttention
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
@@ -145,61 +146,6 @@ class MllamaPrecomputedPositionEmbedding(nn.Module):
         return hidden_state
 
 
-class MllamaVisionSdpaAttention(nn.Module):
-    def __init__(self, config: config_mllama.MllamaVisionConfig):
-        super().__init__()
-
-        model_parallel_size = get_tensor_model_parallel_world_size()
-        self.embed_dim = config.hidden_size
-        self.num_heads = config.attention_heads
-        self.head_dim = config.hidden_size // config.attention_heads
-        self.num_local_heads = self.num_heads // model_parallel_size
-        self.q_size = self.num_local_heads * self.head_dim
-        self.kv_size = self.num_local_heads * self.head_dim
-
-        self.qkv_proj = QKVParallelLinear(
-            self.embed_dim,
-            self.head_dim,
-            self.num_heads,
-            bias=False,
-        )
-        self.o_proj = RowParallelLinear(
-            self.num_heads * self.head_dim,
-            self.embed_dim,
-            bias=False,
-            input_is_parallel=True,
-        )
-
-    def forward(
-        self,
-        hidden_state: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_state)
-        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        q = q.view(
-            q.shape[0], q.shape[1], self.num_local_heads, self.head_dim
-        ).transpose(1, 2)
-        k = k.view(
-            k.shape[0], k.shape[1], self.num_local_heads, self.head_dim
-        ).transpose(1, 2)
-        v = v.view(
-            v.shape[0], v.shape[1], self.num_local_heads, self.head_dim
-        ).transpose(1, 2)
-
-        # TODO: remove padding in image encoder
-        attn_output = F.scaled_dot_product_attention(
-            q, k, v, attn_mask=attention_mask, dropout_p=0.0
-        )
-
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(
-            attn_output.shape[0], attn_output.shape[1], -1
-        )
-        output, _ = self.o_proj(attn_output)
-        return output
-
-
 class MllamaVisionMLP(nn.Module):
     def __init__(self, config, quant_config: Optional[QuantizationConfig] = None):
         super().__init__()
@@ -237,7 +183,17 @@ class MllamaVisionEncoderLayer(nn.Module):
         self.is_gated = is_gated
         self.intermediate_size = config.intermediate_size
 
-        self.self_attn = MllamaVisionSdpaAttention(config)
+        self.self_attn = VisionAttention(
+            self.hidden_size,
+            self.num_attention_heads,
+            self.hidden_size,
+            use_qkv_parallel=True,
+            quant_config=None,
+            dropout=0.0,
+            use_context_forward=False,
+            use_full_precision_softmax=False,
+            flatten_batch=False,
+        )
         self.mlp = MllamaVisionMLP(config)
 
         self.input_layernorm = nn.LayerNorm(self.hidden_size, eps=config.norm_eps)
@@ -992,6 +948,10 @@ class MllamaForConditionalGeneration(nn.Module):
                     weight_loader(param, loaded_weight, shard_id)
                     break
             else:
+                if "vision_model" in name:
+                    # adapt to VisionAttention
+                    name = name.replace("self_attn.o_proj", "self_attn.proj")
+
                 param = params_dict.pop(name)
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
sglang/srt/models/qwen2.py CHANGED
@@ -249,7 +249,10 @@ class Qwen2Model(nn.Module):
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.embed_tokens(input_ids)
+        if hasattr(self.config, "scale_emb"):
+            return self.embed_tokens(input_ids) * self.config.scale_emb
+        else:
+            return self.embed_tokens(input_ids)
 
     def forward(
         self,
sglang/srt/models/qwen2_vl.py CHANGED
@@ -30,13 +30,11 @@ import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from einops import rearrange, repeat
-from vllm.model_executor.layers.activation import QuickGELU
+from einops import rearrange
 
 from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig
-from sglang.srt.distributed import parallel_state
-from sglang.srt.distributed import utils as dist_utils
 from sglang.srt.hf_transformers_utils import get_processor
+from sglang.srt.layers.activation import QuickGELU
 from sglang.srt.layers.attention.vision import VisionAttention
 from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
 from sglang.srt.layers.logits_processor import LogitsProcessor
@@ -118,6 +116,7 @@ class Qwen2VisionBlock(nn.Module):
         mlp_ratio: float,
         act_layer: Type[nn.Module] = QuickGELU,
         norm_layer: Type[nn.Module] = None,
+        attn_implementation: Optional[str] = "sdpa",
         quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
@@ -126,12 +125,24 @@
         self.norm1 = norm_layer(dim)
         self.norm2 = norm_layer(dim)
         mlp_hidden_dim = int(dim * mlp_ratio)
+        if attn_implementation == "sdpa":
+            use_context_forward = False
+            use_full_precision_softmax = False
+        elif attn_implementation == "flash_attention_2":
+            use_full_precision_softmax = False
+            use_context_forward = True
+        elif attn_implementation == "eager":
+            use_full_precision_softmax = True
+            use_context_forward = False
 
         self.attn = VisionAttention(
             embed_dim=dim,
             num_heads=num_heads,
             projection_size=dim,
             use_qkv_parallel=False,
+            use_context_forward=use_context_forward,
+            use_full_precision_softmax=use_full_precision_softmax,
+            flatten_batch=True,
             quant_config=quant_config,
         )
         self.mlp = Qwen2VisionMLP(
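The branch above translates a HuggingFace-style attn_implementation string into the two VisionAttention flags. An equivalent, purely illustrative lookup-table form of the same mapping (the dictionary name is hypothetical):

# (use_context_forward, use_full_precision_softmax) per implementation; an
# unknown key would raise KeyError instead of leaving the flags undefined.
_VISION_ATTN_FLAGS = {
    "sdpa": (False, False),
    "flash_attention_2": (True, False),
    "eager": (False, True),
}
use_context_forward, use_full_precision_softmax = _VISION_ATTN_FLAGS["sdpa"]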
@@ -286,7 +297,6 @@ class Qwen2VisionTransformer(nn.Module):
         norm_layer = partial(nn.LayerNorm, eps=norm_eps)
         head_dim = embed_dim // num_heads
         self.rotary_pos_emb = Qwen2VisionRotaryEmbedding(head_dim // 2)
-
         self.blocks = nn.ModuleList(
             [
                 Qwen2VisionBlock(
@@ -294,6 +304,7 @@
                     num_heads=num_heads,
                     mlp_ratio=mlp_ratio,
                     norm_layer=norm_layer,
+                    attn_implementation="sdpa",
                     quant_config=quant_config,
                 )
                 for _ in range(depth)
@@ -482,10 +493,6 @@ class Qwen2VLForConditionalGeneration(nn.Module):
                 opensource models), the shape will be `(3, seq_len)`,
                 otherwise it will be `(seq_len,).
                 (Use input_metadata.mrope_positions to replace it)
-            pixel_values: Pixel values to be fed to a model.
-                `None` if no images are passed.
-            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
-                `None` if no images are passed.
         """
         if getattr(self.config, "rope_scaling", {}).get("type", None) == "mrope":
             positions = forward_batch.mrope_positions
@@ -540,15 +547,18 @@
                     num_image_tokens = self.calculate_num_image_tokens(
                         image_grid_thws[idx]
                     )
+
                     left_idx = start_idx + (image_offset - prefix_len)
                     right_idx = (
                         start_idx + (image_offset - prefix_len) + num_image_tokens
                     )
+
                     inputs_embeds[left_idx:right_idx] = image_embeds[
                         image_embeds_offset : image_embeds_offset + num_image_tokens
                     ]
                     image_embeds_offset += num_image_tokens
 
+            input_ids = None
             hidden_states = self.model(
                 input_ids=input_ids,
                 positions=positions,
sglang/srt/server_args.py CHANGED
@@ -113,6 +113,7 @@ class ServerArgs:
     # LoRA
     lora_paths: Optional[List[str]] = None
     max_loras_per_batch: int = 8
+    lora_backend: str = "triton"
 
     # Kernel backend
     attention_backend: Optional[str] = None
@@ -163,6 +164,7 @@
     # Custom logit processor
     enable_custom_logit_processor: bool = False
     tool_call_parser: str = None
+    enable_hierarchical_cache: bool = False
 
     def __post_init__(self):
         # Set missing default values
@@ -272,6 +274,10 @@
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"
 
+        # AMD-specific Triton attention KV splits default number
+        if is_hip():
+            self.triton_attention_num_kv_splits = 16
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and port args
@@ -648,13 +654,19 @@
             nargs="*",
             default=None,
             action=LoRAPathAction,
-            help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}",
+            help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}.",
         )
         parser.add_argument(
             "--max-loras-per-batch",
             type=int,
             default=8,
-            help="Maximum number of adapters for a running batch, include base-only request",
+            help="Maximum number of adapters for a running batch, include base-only request.",
+        )
+        parser.add_argument(
+            "--lora-backend",
+            type=str,
+            default="triton",
+            help="Choose the kernel backend for multi-LoRA serving.",
         )
 
         # Kernel backend
@@ -892,6 +904,11 @@
             default=ServerArgs.tool_call_parser,
             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', and 'llama3'.",
         )
+        parser.add_argument(
+            "--enable-hierarchical-cache",
+            action="store_true",
+            help="Enable hierarchical cache",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
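Taken together, the new lora_backend field and --lora-backend flag select the multi-LoRA kernel backend (Triton by default; a FlashInfer-based backend also ships under sglang/srt/lora/backend/ in this release). A hedged sketch of setting it programmatically; the model and adapter paths are placeholders, and fields other than those shown in this diff are assumed from the 0.4.2 API:

from sglang.srt.server_args import ServerArgs

# Placeholder paths; lora_backend mirrors the new --lora-backend CLI flag.
args = ServerArgs(
    model_path="/path/to/base-model",
    lora_paths=["/path/to/adapter"],
    max_loras_per_batch=8,
    lora_backend="triton",
)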
sglang/srt/speculative/build_eagle_tree.py CHANGED
@@ -79,11 +79,13 @@ __global__ void build_tree(Tensor<long, 2> parent_list, Tensor<long, 2> selected
 )
 
 
-def build_tree_kernel(parent_list, top_score_index, seq_lens, topk, depth, draft_token):
+def build_tree_kernel(
+    parent_list, top_score_index, seq_lens, seq_lens_sum, topk, depth, draft_token
+):
     bs = seq_lens.numel()
     device = parent_list.device
     tree_mask = torch.full(
-        (torch.sum(seq_lens).item() * draft_token + draft_token * draft_token * bs,),
+        (seq_lens_sum * draft_token + draft_token * draft_token * bs,),
        True,
         device=device,
     )
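The new seq_lens_sum parameter moves the reduction over seq_lens out of the helper, so a caller that already tracks the total avoids an extra torch.sum(...).item() device sync inside the wrapper. A hedged sketch of the corresponding call-site change; all tensors are assumed to be in scope:

# Hypothetical caller: compute the total once (or reuse a tracked value such as
# forward_batch.seq_lens_sum) and pass it alongside seq_lens.
seq_lens_sum = int(seq_lens.sum().item())
build_tree_kernel(
    parent_list, top_score_index, seq_lens, seq_lens_sum, topk, depth, draft_token
)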