sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
Files changed (185)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +6 -0
  3. sglang/bench_one_batch.py +1 -1
  4. sglang/bench_one_batch_server.py +1 -1
  5. sglang/bench_serving.py +26 -4
  6. sglang/check_env.py +3 -4
  7. sglang/lang/backend/openai.py +18 -5
  8. sglang/lang/chat_template.py +28 -7
  9. sglang/lang/interpreter.py +7 -3
  10. sglang/lang/ir.py +10 -0
  11. sglang/srt/_custom_ops.py +1 -1
  12. sglang/srt/code_completion_parser.py +174 -0
  13. sglang/srt/configs/__init__.py +2 -6
  14. sglang/srt/configs/deepseekvl2.py +676 -0
  15. sglang/srt/configs/janus_pro.py +3 -4
  16. sglang/srt/configs/load_config.py +1 -0
  17. sglang/srt/configs/model_config.py +49 -8
  18. sglang/srt/configs/utils.py +25 -0
  19. sglang/srt/connector/__init__.py +51 -0
  20. sglang/srt/connector/base_connector.py +112 -0
  21. sglang/srt/connector/redis.py +85 -0
  22. sglang/srt/connector/s3.py +122 -0
  23. sglang/srt/connector/serde/__init__.py +31 -0
  24. sglang/srt/connector/serde/safe_serde.py +29 -0
  25. sglang/srt/connector/serde/serde.py +43 -0
  26. sglang/srt/connector/utils.py +35 -0
  27. sglang/srt/conversation.py +88 -0
  28. sglang/srt/disaggregation/conn.py +81 -0
  29. sglang/srt/disaggregation/decode.py +495 -0
  30. sglang/srt/disaggregation/mini_lb.py +285 -0
  31. sglang/srt/disaggregation/prefill.py +249 -0
  32. sglang/srt/disaggregation/utils.py +44 -0
  33. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  34. sglang/srt/distributed/parallel_state.py +42 -8
  35. sglang/srt/entrypoints/engine.py +55 -5
  36. sglang/srt/entrypoints/http_server.py +78 -13
  37. sglang/srt/entrypoints/verl_engine.py +2 -0
  38. sglang/srt/function_call_parser.py +133 -55
  39. sglang/srt/hf_transformers_utils.py +28 -3
  40. sglang/srt/layers/activation.py +4 -2
  41. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  42. sglang/srt/layers/attention/flashattention_backend.py +434 -0
  43. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  44. sglang/srt/layers/attention/flashmla_backend.py +284 -0
  45. sglang/srt/layers/attention/triton_backend.py +171 -38
  46. sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
  47. sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
  48. sglang/srt/layers/attention/utils.py +53 -0
  49. sglang/srt/layers/attention/vision.py +9 -28
  50. sglang/srt/layers/dp_attention.py +41 -19
  51. sglang/srt/layers/layernorm.py +24 -2
  52. sglang/srt/layers/linear.py +17 -5
  53. sglang/srt/layers/logits_processor.py +25 -7
  54. sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
  55. sglang/srt/layers/moe/ep_moe/layer.py +273 -1
  56. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
  57. sglang/srt/layers/moe/fused_moe_native.py +2 -1
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
  63. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
  64. sglang/srt/layers/moe/topk.py +60 -20
  65. sglang/srt/layers/parameter.py +1 -1
  66. sglang/srt/layers/quantization/__init__.py +80 -53
  67. sglang/srt/layers/quantization/awq.py +200 -0
  68. sglang/srt/layers/quantization/base_config.py +5 -0
  69. sglang/srt/layers/quantization/blockwise_int8.py +1 -1
  70. sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  71. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
  72. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
  73. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
  74. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
  75. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
  76. sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
  77. sglang/srt/layers/quantization/fp8.py +76 -34
  78. sglang/srt/layers/quantization/fp8_kernel.py +25 -8
  79. sglang/srt/layers/quantization/fp8_utils.py +284 -28
  80. sglang/srt/layers/quantization/gptq.py +36 -19
  81. sglang/srt/layers/quantization/kv_cache.py +98 -0
  82. sglang/srt/layers/quantization/modelopt_quant.py +9 -7
  83. sglang/srt/layers/quantization/utils.py +153 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
  85. sglang/srt/layers/rotary_embedding.py +78 -87
  86. sglang/srt/layers/sampler.py +1 -1
  87. sglang/srt/lora/backend/base_backend.py +4 -4
  88. sglang/srt/lora/backend/flashinfer_backend.py +12 -9
  89. sglang/srt/lora/backend/triton_backend.py +5 -8
  90. sglang/srt/lora/layers.py +87 -33
  91. sglang/srt/lora/lora.py +2 -22
  92. sglang/srt/lora/lora_manager.py +67 -30
  93. sglang/srt/lora/mem_pool.py +117 -52
  94. sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
  95. sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
  96. sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
  97. sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
  98. sglang/srt/lora/utils.py +18 -1
  99. sglang/srt/managers/cache_controller.py +2 -5
  100. sglang/srt/managers/data_parallel_controller.py +30 -8
  101. sglang/srt/managers/expert_distribution.py +81 -0
  102. sglang/srt/managers/io_struct.py +43 -5
  103. sglang/srt/managers/mm_utils.py +373 -0
  104. sglang/srt/managers/multimodal_processor.py +68 -0
  105. sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
  106. sglang/srt/managers/multimodal_processors/clip.py +63 -0
  107. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
  108. sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
  109. sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
  110. sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
  111. sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
  112. sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
  113. sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
  114. sglang/srt/managers/schedule_batch.py +134 -30
  115. sglang/srt/managers/scheduler.py +290 -31
  116. sglang/srt/managers/session_controller.py +1 -1
  117. sglang/srt/managers/tokenizer_manager.py +59 -24
  118. sglang/srt/managers/tp_worker.py +4 -1
  119. sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
  120. sglang/srt/managers/utils.py +6 -1
  121. sglang/srt/mem_cache/hiradix_cache.py +18 -7
  122. sglang/srt/mem_cache/memory_pool.py +255 -98
  123. sglang/srt/mem_cache/paged_allocator.py +2 -2
  124. sglang/srt/mem_cache/radix_cache.py +4 -4
  125. sglang/srt/model_executor/cuda_graph_runner.py +36 -21
  126. sglang/srt/model_executor/forward_batch_info.py +68 -11
  127. sglang/srt/model_executor/model_runner.py +75 -8
  128. sglang/srt/model_loader/loader.py +171 -3
  129. sglang/srt/model_loader/weight_utils.py +51 -3
  130. sglang/srt/models/clip.py +563 -0
  131. sglang/srt/models/deepseek_janus_pro.py +31 -88
  132. sglang/srt/models/deepseek_nextn.py +22 -10
  133. sglang/srt/models/deepseek_v2.py +329 -73
  134. sglang/srt/models/deepseek_vl2.py +358 -0
  135. sglang/srt/models/gemma3_causal.py +694 -0
  136. sglang/srt/models/gemma3_mm.py +468 -0
  137. sglang/srt/models/llama.py +47 -7
  138. sglang/srt/models/llama_eagle.py +1 -0
  139. sglang/srt/models/llama_eagle3.py +196 -0
  140. sglang/srt/models/llava.py +3 -3
  141. sglang/srt/models/llavavid.py +3 -3
  142. sglang/srt/models/minicpmo.py +1995 -0
  143. sglang/srt/models/minicpmv.py +62 -137
  144. sglang/srt/models/mllama.py +4 -4
  145. sglang/srt/models/phi3_small.py +1 -1
  146. sglang/srt/models/qwen2.py +3 -0
  147. sglang/srt/models/qwen2_5_vl.py +68 -146
  148. sglang/srt/models/qwen2_classification.py +75 -0
  149. sglang/srt/models/qwen2_moe.py +9 -1
  150. sglang/srt/models/qwen2_vl.py +25 -63
  151. sglang/srt/openai_api/adapter.py +201 -104
  152. sglang/srt/openai_api/protocol.py +33 -7
  153. sglang/srt/patch_torch.py +71 -0
  154. sglang/srt/sampling/sampling_batch_info.py +1 -1
  155. sglang/srt/sampling/sampling_params.py +6 -6
  156. sglang/srt/server_args.py +114 -14
  157. sglang/srt/speculative/build_eagle_tree.py +7 -347
  158. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
  159. sglang/srt/speculative/eagle_utils.py +208 -252
  160. sglang/srt/speculative/eagle_worker.py +140 -54
  161. sglang/srt/speculative/spec_info.py +6 -1
  162. sglang/srt/torch_memory_saver_adapter.py +22 -0
  163. sglang/srt/utils.py +215 -21
  164. sglang/test/__init__.py +0 -0
  165. sglang/test/attention/__init__.py +0 -0
  166. sglang/test/attention/test_flashattn_backend.py +312 -0
  167. sglang/test/runners.py +29 -2
  168. sglang/test/test_activation.py +2 -1
  169. sglang/test/test_block_fp8.py +5 -4
  170. sglang/test/test_block_fp8_ep.py +2 -1
  171. sglang/test/test_dynamic_grad_mode.py +58 -0
  172. sglang/test/test_layernorm.py +3 -2
  173. sglang/test/test_utils.py +56 -5
  174. sglang/utils.py +31 -0
  175. sglang/version.py +1 -1
  176. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +16 -8
  177. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +180 -132
  178. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +1 -1
  179. sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
  180. sglang/srt/managers/image_processor.py +0 -55
  181. sglang/srt/managers/image_processors/base_image_processor.py +0 -219
  182. sglang/srt/managers/image_processors/minicpmv.py +0 -86
  183. sglang/srt/managers/multi_modality_padding.py +0 -134
  184. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info/licenses}/LICENSE +0 -0
  185. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
sglang/srt/configs/deepseekvl2.py (new file)
@@ -0,0 +1,676 @@
+import math
+import os
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from PIL import Image, ImageOps
+from transformers import (
+    AutoProcessor,
+    LlamaTokenizerFast,
+    PretrainedConfig,
+    ProcessorMixin,
+)
+
+
+def select_best_resolution(image_size, candidate_resolutions):
+    # used for cropping
+    original_width, original_height = image_size
+    best_fit = None
+    max_effective_resolution = 0
+    min_wasted_resolution = float("inf")
+
+    for width, height in candidate_resolutions:
+        scale = min(width / original_width, height / original_height)
+        downscaled_width, downscaled_height = int(original_width * scale), int(
+            original_height * scale
+        )
+        effective_resolution = min(
+            downscaled_width * downscaled_height, original_width * original_height
+        )
+        wasted_resolution = (width * height) - effective_resolution
+
+        if effective_resolution > max_effective_resolution or (
+            effective_resolution == max_effective_resolution
+            and wasted_resolution < min_wasted_resolution
+        ):
+            max_effective_resolution = effective_resolution
+            min_wasted_resolution = wasted_resolution
+            best_fit = (width, height)
+
+    return best_fit
+
+
+class DictOutput(object):
+    def keys(self):
+        return self.__dict__.keys()
+
+    def __getitem__(self, item):
+        return self.__dict__[item]
+
+    def __setitem__(self, key, value):
+        self.__dict__[key] = value
+
+
+@dataclass
+class VLChatProcessorOutput(DictOutput):
+    input_ids: torch.LongTensor
+    target_ids: torch.LongTensor
+    images: torch.Tensor
+    images_seq_mask: torch.BoolTensor
+    images_spatial_crop: torch.LongTensor
+
+    def __len__(self):
+        return len(self.input_ids)
+
+
+class ImageTransform(object):
+    def __init__(
+        self,
+        mean: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
+        std: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+    ):
+        self.mean = mean
+        self.std = std
+        self.normalize = normalize
+
+        # only load torchvision.transforms when needed
+        try:
+            import torchvision.transforms as T
+
+            # FIXME: add version check for gguf
+        except ImportError as err:
+            raise ImportError(
+                "Please install torchvision via `pip install torchvision` to use Deepseek-VL2."
+            ) from err
+
+        transform_pipelines = [T.ToTensor()]
+
+        if normalize:
+            transform_pipelines.append(T.Normalize(mean, std))
+
+        self.transform = T.Compose(transform_pipelines)
+
+    def __call__(self, pil_img: Image.Image):
+        x = self.transform(pil_img)
+        return x
+
+
+class DeepseekVLV2Processor(ProcessorMixin):
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    attributes = ["tokenizer"]
+
+    def __init__(
+        self,
+        tokenizer: LlamaTokenizerFast,
+        candidate_resolutions: Tuple[Tuple[int, int]],
+        patch_size: int,
+        downsample_ratio: int,
+        image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+        image_token: str = "<image>",
+        pad_token: str = "<|▁pad▁|>",
+        add_special_token: bool = False,
+        sft_format: str = "deepseek",
+        mask_prompt: bool = True,
+        ignore_id: int = -100,
+        **kwargs,
+    ):
+
+        self.candidate_resolutions = candidate_resolutions
+        self.image_size = candidate_resolutions[0][0]
+        self.patch_size = patch_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.normalize = normalize
+        self.downsample_ratio = downsample_ratio
+
+        self.image_transform = ImageTransform(
+            mean=image_mean, std=image_std, normalize=normalize
+        )
+        self.tokenizer = tokenizer
+        # must set this,padding side with make a difference in batch inference
+        self.tokenizer.padding_side = "left"
+
+        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
+        if tokenizer.pad_token is None:
+            self.tokenizer.add_special_tokens({"pad_token": pad_token})
+
+        # add image token
+        image_token_id = self.tokenizer.vocab.get(image_token)
+        if image_token_id is None:
+            special_tokens = [image_token]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+        self.image_token_id = self.tokenizer.vocab.get(image_token)
+
+        # add five special tokens for grounding-related tasks
+        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
+        special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        # add special tokens for SFT data
+        special_tokens = ["<|User|>", "<|Assistant|>"]
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        self.image_token = image_token
+        self.pad_token = pad_token
+        self.add_special_token = add_special_token
+        self.sft_format = sft_format
+        self.mask_prompt = mask_prompt
+        self.ignore_id = ignore_id
+
+        super().__init__(
+            tokenizer,
+            **kwargs,
+        )
+
+    def format_messages_v2(self, messages, pil_images, max_req_input_len=-1):
+        """play the role of format_messages_v2 and get_images_info in the last version"""
+        tokenized_data = []
+        masked_tokenized_data = []  # labels
+        images_list = []
+        images_seq_mask = []
+        images_spatial_crop = []
+
+        image_index = 0
+        image_token_cnt = messages.count(self.image_token)
+        tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
+            messages,
+            pil_images[image_index : image_index + image_token_cnt],
+            bos=False,
+            eos=True,
+            cropping=len(pil_images) <= 2,
+            max_req_input_len=max_req_input_len,
+        )
+
+        image_index = image_token_cnt
+        tokenized_data += tokenized_str
+        if self.mask_prompt:
+            masked_tokenized_data += [self.ignore_id] * len(tokenized_str)
+        else:
+            masked_tokenized_data += tokenized_str
+        images_list += images
+        images_seq_mask += seq_mask
+        images_spatial_crop += spatial_crop
+
+        assert len(tokenized_data) == len(
+            images_seq_mask
+        ), f"format_messages_v2: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+
+        return (
+            tokenized_data,
+            masked_tokenized_data,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+        )
+
+    @property
+    def bos_id(self):
+        return self.tokenizer.bos_token_id
+
+    @property
+    def eos_id(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def pad_id(self):
+        return self.tokenizer.pad_token_id
+
+    def encode(self, text: str, bos: bool = True, eos: bool = False):
+        t = self.tokenizer.encode(text, add_special_tokens=False)
+
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+
+        return t
+
+    def decode(self, t: List[int], **kwargs) -> str:
+        return self.tokenizer.decode(t, **kwargs)
+
+    def process_one(
+        self,
+        prompt: str = None,
+        conversations: List[Dict[str, str]] = None,
+        images: List[Image.Image] = None,
+        apply_sft_format: bool = False,
+        inference_mode: bool = True,
+        system_prompt: str = "",
+        max_req_input_len: int = -1,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            conversations (List[Dict]): conversations with a list of messages;
+            images (List[ImageType]): the list of images;
+            apply_sft_format (bool): if prompt is not None, then apply the SFT format to prompt;
+                if conversations is not None, then it will always apply the SFT format to conversations;
+            inference_mode (bool): if True, then remove the last eos token;
+            system_prompt (str): the system prompt;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - target_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        assert (
+            prompt is None or conversations is None
+        ), "prompt and conversations cannot be used at the same time."
+
+        (
+            tokenized_str,
+            masked_tokenized_str,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+        ) = self.format_messages_v2(conversations, images, max_req_input_len)
+
+        assert (
+            len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
+        ), (
+            f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+            f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
+        )
+
+        input_ids = torch.LongTensor(tokenized_str)
+        target_ids = torch.LongTensor(masked_tokenized_str)
+        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
+
+        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
+        target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
+            self.ignore_id
+        )
+        input_ids[input_ids < 0] = self.pad_id
+
+        if inference_mode:
+            assert input_ids[-1] == self.eos_id
+            input_ids = input_ids[:-1]
+            target_ids = target_ids[:-1]
+            images_seq_mask = images_seq_mask[:-1]
+
+        if len(images_list) == 0:
+            images = torch.zeros((1, 3, self.image_size, self.image_size))
+            images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
+        else:
+            images = torch.stack(images_list, dim=0)
+            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+
+        prepare = VLChatProcessorOutput(
+            input_ids=input_ids,
+            target_ids=target_ids,
+            images=images,
+            images_seq_mask=images_seq_mask,
+            images_spatial_crop=images_spatial_crop,
+        )
+
+        return prepare
+
+    def __call__(
+        self,
+        *,
+        prompt: str = None,
+        conversations: List[Dict[str, str]] = None,
+        images: List[Image.Image] = None,
+        apply_sft_format: bool = False,
+        inference_mode: bool = True,
+        system_prompt: str = "",
+        max_req_input_len: int = -1,
+        **kwargs,
+    ):
+        prepare = self.process_one(
+            prompt=prompt,
+            conversations=conversations,
+            images=images,
+            apply_sft_format=apply_sft_format,
+            inference_mode=inference_mode,
+            system_prompt=system_prompt,
+            max_req_input_len=max_req_input_len,
+        )
+
+        return prepare
+
+    def find_all_indices(self, messages, target_value):
+        indices = []
+        for index, item in enumerate(messages):
+            if item == target_value:
+                indices.append(index)
+        return indices
+
+    def tokenize_with_images(
+        self,
+        conversation: str,
+        images: List[Image.Image],
+        bos: bool = True,
+        eos: bool = True,
+        cropping: bool = True,
+        max_req_input_len: int = -1,
+    ):
+        """Tokenize text with <image> tags."""
+        images_list, images_seq_mask, images_spatial_crop = [], [], []
+        text_splits = conversation.split(self.image_token)
+        tokenized_str = []
+        for text_sep, image in zip(text_splits, images):
+            """encode text_sep"""
+            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
+            tokenized_str += tokenized_sep
+            images_seq_mask += [False] * len(tokenized_sep)
+
+            """select best resolution for anyres"""
+            if cropping:
+                best_width, best_height = select_best_resolution(
+                    image.size, self.candidate_resolutions
+                )
+            else:
+                best_width, best_height = self.image_size, self.image_size
+            # print(image.size, (best_width, best_height)) # check the select_best_resolutions func
+
+            """process the global view"""
+            global_view = ImageOps.pad(
+                image,
+                (self.image_size, self.image_size),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
+            images_list.append(self.image_transform(global_view))
+
+            """process the local views"""
+            local_view = ImageOps.pad(
+                image,
+                (best_width, best_height),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
+            for i in range(0, best_height, self.image_size):
+                for j in range(0, best_width, self.image_size):
+                    images_list.append(
+                        self.image_transform(
+                            local_view.crop(
+                                (j, i, j + self.image_size, i + self.image_size)
+                            )
+                        )
+                    )
+
+            """record height / width crop num"""
+            num_width_tiles, num_height_tiles = (
+                best_width // self.image_size,
+                best_height // self.image_size,
+            )
+            images_spatial_crop.append([num_width_tiles, num_height_tiles])
+
+            """add image tokens"""
+            h = w = math.ceil(
+                (self.image_size // self.patch_size) / self.downsample_ratio
+            )
+            # global views tokens h * (w + 1), 1 is for line seperator
+            tokenized_image = [self.image_token_id] * h * (w + 1)
+            # add a seperator between global and local views
+            tokenized_image += [self.image_token_id]
+            # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
+            tokenized_image += (
+                [self.image_token_id]
+                * (num_height_tiles * h)
+                * (num_width_tiles * w + 1)
+            )
+
+            tokenized_str += tokenized_image
+            images_seq_mask += [True] * len(tokenized_image)
+            # print(width_crop_num, height_crop_num, len(tokenized_image)) # test the correctness of the number of image-related tokens
+
+        """process the last text split"""
+        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
+        # deal with video, limit with request len
+        if max_req_input_len > -1:
+            if max_req_input_len < len(tokenized_sep) + len(tokenized_str) - 1:
+                rest = max_req_input_len - len(tokenized_sep) - 1 - 1024
+                tokenized_str = tokenized_str[:rest]
+                images_seq_mask = images_seq_mask[:rest]
+        tokenized_str += tokenized_sep
+        images_seq_mask += [False] * len(tokenized_sep)
+
+        """add the bos and eos tokens"""
+        if bos:
+            tokenized_str = [self.bos_id] + tokenized_str
+            images_seq_mask = [False] + images_seq_mask
+        if eos:
+            tokenized_str = tokenized_str + [self.eos_id]
+            images_seq_mask = images_seq_mask + [False]
+
+        assert len(tokenized_str) == len(
+            images_seq_mask
+        ), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+
+        return tokenized_str, images_list, images_seq_mask, images_spatial_crop
+
+
+class DeepseekVL2VisionEncoderConfig(PretrainedConfig):
+    model_type: str = "vision"
+
+    model_name: str = "siglip_large_patch16_384"
+    image_size: int = 384
+    patch_size: int = 16
+    width: int = 1024
+    layers: int = 24
+    heads: int = 16
+    mlp_ratio: int = 4
+    global_pool: str = "map"
+    ignore_head: bool = True
+    class_token: bool = False
+    num_classes: int = 0
+    use_checkpoint: bool = False
+    weight_init: str = "skip"
+    deterministic: bool = False
+    num_recomputing_layers: int = 0
+
+    def __init__(
+        self,
+        model_name: str = "siglip_large_patch16_384",
+        image_size: int = 384,
+        patch_size: int = 16,
+        width: int = 1024,
+        layers: int = 24,
+        heads: int = 16,
+        mlp_ratio: int = 4,
+        global_pool: str = "map",
+        ignore_head: bool = True,
+        class_token: bool = False,
+        num_classes: int = 0,
+        use_checkpoint: bool = False,
+        **kwargs,
+    ):
+        self.model_name = model_name
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.width = width
+        self.layers = layers
+        self.heads = heads
+        self.mlp_ratio = mlp_ratio
+        self.global_pool = global_pool
+        self.ignore_head = ignore_head
+        self.class_token = class_token
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+
+        super().__init__(**kwargs)
+
+
+class DeepseekVL2MlpProjectorConfig(PretrainedConfig):
+    model_type = "mlp_projector"
+    projector_type: str = "downsample_mlp_gelu"
+    input_dim: int = 1152
+    n_embed: int = 2048
+    depth: int = 2
+    mlp_ratio: int = 1
+    downsample_ratio: int = 2
+    token_pooling: bool = False
+
+    def __init__(
+        self,
+        projector_type: str = "downsample_mlp_gelu",
+        input_dim: int = 1152,
+        n_embed: int = 2048,
+        depth: int = 2,
+        mlp_ratio: int = 1,
+        downsample_ratio: int = 2,
+        **kwargs,
+    ):
+        self.projector_type = projector_type
+        self.input_dim = input_dim
+        self.n_embed = n_embed
+        self.depth = depth
+        self.mlp_ratio = mlp_ratio
+        self.downsample_ratio = downsample_ratio
+
+        super().__init__(**kwargs)
+
+
+class DeepseekV2Config(PretrainedConfig):
+
+    model_type = "deepseek_v2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        hidden_size=4096,
+        intermediate_size=11008,
+        moe_intermediate_size=1407,
+        num_hidden_layers=30,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        n_shared_experts=None,
+        n_routed_experts=None,
+        ep_size=1,
+        routed_scaling_factor=1.0,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        qk_nope_head_dim=128,
+        topk_method="gready",
+        n_group=None,
+        topk_group=None,
+        num_experts_per_tok=None,
+        moe_layer_freq=1,
+        first_k_dense_replace=0,
+        norm_topk_prob=False,
+        scoring_func="softmax",
+        aux_loss_alpha=0.001,
+        seq_aux=True,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=100000,
+        eos_token_id=100001,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        use_mla=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.ep_size = ep_size
+        self.routed_scaling_factor = routed_scaling_factor
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.topk_method = topk_method
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.num_experts_per_tok = num_experts_per_tok
+        self.moe_layer_freq = moe_layer_freq
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+        self.scoring_func = scoring_func
+        self.aux_loss_alpha = aux_loss_alpha
+        self.seq_aux = seq_aux
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = float(rms_norm_eps)
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.use_mla = use_mla
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class DeepseekVL2Config(PretrainedConfig):
+    model_type = "deepseek_vl_v2"
+    vision_config: DeepseekVL2VisionEncoderConfig
+    projector_config: DeepseekVL2MlpProjectorConfig
+    language_config: DeepseekV2Config
+
+    tile_tag: str = "2D"
+    global_view_pos: str = "head"
+    candidate_resolutions: Tuple[Tuple[int, int]] = ((384, 384),)
+
+    def __init__(
+        self,
+        tile_tag: str = "tile_tag",
+        global_view_pos: str = "head",
+        candidate_resolutions: Tuple[Tuple[int, int]] = ((384, 384),),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        vision_config = kwargs.get("vision_config", {})
+        self.vision_config = DeepseekVL2VisionEncoderConfig(**vision_config)
+
+        projector_config = kwargs.get("projector_config", {})
+        self.projector_config = DeepseekVL2MlpProjectorConfig(**projector_config)
+
+        language_config = kwargs.get("language_config", {})
+        if isinstance(language_config, DeepseekV2Config):
+            self.language_config = language_config
+        else:
+            self.language_config = DeepseekV2Config(**language_config)
+
+        self.tile_tag = tile_tag
+        self.global_view_pos = global_view_pos
+        self.candidate_resolutions = candidate_resolutions
+        self.architectures = ["DeepseekVL2ForCausalLM"]
+
+
+AutoProcessor.register(DeepseekVL2Config, DeepseekVLV2Processor)
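
The select_best_resolution helper above drives the any-resolution tiling: it picks the candidate grid that keeps the most effective pixels and, on ties, wastes the least padding. A minimal sketch of its behavior, assuming the module path matches the wheel layout listed above; the candidate list below is illustrative and not taken from a shipped checkpoint:

# Illustrative only: real DeepSeek-VL2 checkpoints supply their own
# candidate_resolutions through the processor config.
from sglang.srt.configs.deepseekvl2 import select_best_resolution

candidates = [(384, 384), (768, 384), (384, 768), (768, 768)]

# A wide 1000x500 image downscales into the 768x384 slot with zero wasted area.
print(select_best_resolution((1000, 500), candidates))  # (768, 384)

# Upscaling is allowed (scale > 1), so a 512x512 image keeps all of its pixels
# by mapping to 768x768 rather than shrinking into 384x384.
print(select_best_resolution((512, 512), candidates))  # (768, 768)
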
sglang/srt/configs/janus_pro.py
@@ -9,8 +9,6 @@ import PIL
 import torch
 from PIL.Image import Image
 from transformers import (
-    AutoImageProcessor,
-    AutoProcessor,
     BaseImageProcessor,
     BatchFeature,
     LlamaConfig,
@@ -20,6 +18,7 @@ from transformers import (
 )
 from transformers.image_utils import to_numpy_array
 
+from sglang.srt.configs.utils import register_image_processor, register_processor
 from sglang.srt.mm_utils import expand2square
 
 
@@ -625,5 +624,5 @@ class VLMImageProcessorConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
 
-AutoProcessor.register(MultiModalityConfig, VLChatProcessor, exist_ok=True)
-AutoImageProcessor.register(VLMImageProcessorConfig, None, VLMImageProcessor, None)
+register_processor(MultiModalityConfig, VLChatProcessor)
+register_image_processor(MultiModalityConfig, VLMImageProcessor)
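
This hunk swaps the direct AutoProcessor.register / AutoImageProcessor.register calls for the new register_processor / register_image_processor helpers from sglang/srt/configs/utils.py (+25 -0 in the file list). The helpers' bodies are not part of this diff; the sketch below is only a guess at what they wrap, mirroring the transformers calls they replace:

# Hypothetical sketch of sglang/srt/configs/utils.py -- not the actual file
# contents, which this diff does not show.
from transformers import AutoImageProcessor, AutoProcessor


def register_processor(config_cls, processor_cls):
    # Same call as the removed line, keeping exist_ok so re-imports don't raise.
    AutoProcessor.register(config_cls, processor_cls, exist_ok=True)


def register_image_processor(config_cls, image_processor_cls):
    # Mirrors the removed AutoImageProcessor.register(..., None, cls, None) call.
    AutoImageProcessor.register(config_cls, None, image_processor_cls, None)
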
sglang/srt/configs/load_config.py
@@ -22,6 +22,7 @@ class LoadFormat(str, enum.Enum):
     MISTRAL = "mistral"
     LAYERED = "layered"
     JAX = "jax"
+    REMOTE = "remote"
 
 
 @dataclass
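
The new REMOTE member pairs with the connector package added in this release (sglang/srt/connector/redis.py, s3.py and friends in the file list). Because LoadFormat subclasses str, the raw CLI string maps straight onto the member; a small check, assuming the module path shown above:

from sglang.srt.configs.load_config import LoadFormat

# LoadFormat mixes in str, so the value passed on the command line (e.g. a
# hypothetical --load-format remote) compares directly against the member.
assert LoadFormat.REMOTE == "remote"
assert LoadFormat("remote") is LoadFormat.REMOTE
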