sglang 0.4.4__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
Files changed (176)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +6 -0
  3. sglang/bench_one_batch.py +1 -1
  4. sglang/bench_one_batch_server.py +1 -1
  5. sglang/bench_serving.py +3 -1
  6. sglang/check_env.py +3 -4
  7. sglang/lang/backend/openai.py +18 -5
  8. sglang/lang/chat_template.py +28 -7
  9. sglang/lang/interpreter.py +7 -3
  10. sglang/lang/ir.py +10 -0
  11. sglang/srt/_custom_ops.py +1 -1
  12. sglang/srt/code_completion_parser.py +174 -0
  13. sglang/srt/configs/__init__.py +2 -6
  14. sglang/srt/configs/deepseekvl2.py +667 -0
  15. sglang/srt/configs/janus_pro.py +3 -4
  16. sglang/srt/configs/load_config.py +1 -0
  17. sglang/srt/configs/model_config.py +63 -11
  18. sglang/srt/configs/utils.py +25 -0
  19. sglang/srt/connector/__init__.py +51 -0
  20. sglang/srt/connector/base_connector.py +112 -0
  21. sglang/srt/connector/redis.py +85 -0
  22. sglang/srt/connector/s3.py +122 -0
  23. sglang/srt/connector/serde/__init__.py +31 -0
  24. sglang/srt/connector/serde/safe_serde.py +29 -0
  25. sglang/srt/connector/serde/serde.py +43 -0
  26. sglang/srt/connector/utils.py +35 -0
  27. sglang/srt/conversation.py +88 -0
  28. sglang/srt/disaggregation/conn.py +81 -0
  29. sglang/srt/disaggregation/decode.py +495 -0
  30. sglang/srt/disaggregation/mini_lb.py +285 -0
  31. sglang/srt/disaggregation/prefill.py +249 -0
  32. sglang/srt/disaggregation/utils.py +44 -0
  33. sglang/srt/distributed/parallel_state.py +10 -3
  34. sglang/srt/entrypoints/engine.py +55 -5
  35. sglang/srt/entrypoints/http_server.py +71 -12
  36. sglang/srt/function_call_parser.py +164 -54
  37. sglang/srt/hf_transformers_utils.py +28 -3
  38. sglang/srt/layers/activation.py +4 -2
  39. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +295 -0
  41. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  42. sglang/srt/layers/attention/flashmla_backend.py +284 -0
  43. sglang/srt/layers/attention/triton_backend.py +171 -38
  44. sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
  45. sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
  46. sglang/srt/layers/attention/utils.py +53 -0
  47. sglang/srt/layers/attention/vision.py +9 -28
  48. sglang/srt/layers/dp_attention.py +62 -23
  49. sglang/srt/layers/elementwise.py +411 -0
  50. sglang/srt/layers/layernorm.py +24 -2
  51. sglang/srt/layers/linear.py +17 -5
  52. sglang/srt/layers/logits_processor.py +26 -7
  53. sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
  54. sglang/srt/layers/moe/ep_moe/layer.py +273 -1
  55. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
  56. sglang/srt/layers/moe/fused_moe_native.py +2 -1
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
  63. sglang/srt/layers/moe/router.py +342 -0
  64. sglang/srt/layers/moe/topk.py +31 -18
  65. sglang/srt/layers/parameter.py +1 -1
  66. sglang/srt/layers/quantization/__init__.py +184 -126
  67. sglang/srt/layers/quantization/base_config.py +5 -0
  68. sglang/srt/layers/quantization/blockwise_int8.py +1 -1
  69. sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  70. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
  71. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
  72. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
  73. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
  74. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
  75. sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
  76. sglang/srt/layers/quantization/fp8.py +76 -34
  77. sglang/srt/layers/quantization/fp8_kernel.py +24 -8
  78. sglang/srt/layers/quantization/fp8_utils.py +284 -28
  79. sglang/srt/layers/quantization/gptq.py +36 -9
  80. sglang/srt/layers/quantization/kv_cache.py +98 -0
  81. sglang/srt/layers/quantization/modelopt_quant.py +9 -7
  82. sglang/srt/layers/quantization/utils.py +153 -0
  83. sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
  84. sglang/srt/layers/rotary_embedding.py +66 -87
  85. sglang/srt/layers/sampler.py +1 -1
  86. sglang/srt/lora/layers.py +68 -0
  87. sglang/srt/lora/lora.py +2 -22
  88. sglang/srt/lora/lora_manager.py +47 -23
  89. sglang/srt/lora/mem_pool.py +110 -51
  90. sglang/srt/lora/utils.py +12 -1
  91. sglang/srt/managers/cache_controller.py +4 -5
  92. sglang/srt/managers/data_parallel_controller.py +31 -9
  93. sglang/srt/managers/expert_distribution.py +81 -0
  94. sglang/srt/managers/io_struct.py +39 -3
  95. sglang/srt/managers/mm_utils.py +373 -0
  96. sglang/srt/managers/multimodal_processor.py +68 -0
  97. sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
  98. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
  99. sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
  100. sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
  101. sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
  102. sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
  103. sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
  104. sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
  105. sglang/srt/managers/schedule_batch.py +134 -31
  106. sglang/srt/managers/scheduler.py +325 -38
  107. sglang/srt/managers/scheduler_output_processor_mixin.py +4 -1
  108. sglang/srt/managers/session_controller.py +1 -1
  109. sglang/srt/managers/tokenizer_manager.py +59 -23
  110. sglang/srt/managers/tp_worker.py +1 -1
  111. sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
  112. sglang/srt/managers/utils.py +6 -1
  113. sglang/srt/mem_cache/hiradix_cache.py +27 -8
  114. sglang/srt/mem_cache/memory_pool.py +258 -98
  115. sglang/srt/mem_cache/paged_allocator.py +2 -2
  116. sglang/srt/mem_cache/radix_cache.py +4 -4
  117. sglang/srt/model_executor/cuda_graph_runner.py +85 -28
  118. sglang/srt/model_executor/forward_batch_info.py +81 -15
  119. sglang/srt/model_executor/model_runner.py +70 -6
  120. sglang/srt/model_loader/loader.py +160 -2
  121. sglang/srt/model_loader/weight_utils.py +45 -0
  122. sglang/srt/models/deepseek_janus_pro.py +29 -86
  123. sglang/srt/models/deepseek_nextn.py +22 -10
  124. sglang/srt/models/deepseek_v2.py +326 -192
  125. sglang/srt/models/deepseek_vl2.py +358 -0
  126. sglang/srt/models/gemma3_causal.py +684 -0
  127. sglang/srt/models/gemma3_mm.py +462 -0
  128. sglang/srt/models/grok.py +374 -119
  129. sglang/srt/models/llama.py +47 -7
  130. sglang/srt/models/llama_eagle.py +1 -0
  131. sglang/srt/models/llama_eagle3.py +196 -0
  132. sglang/srt/models/llava.py +3 -3
  133. sglang/srt/models/llavavid.py +3 -3
  134. sglang/srt/models/minicpmo.py +1995 -0
  135. sglang/srt/models/minicpmv.py +62 -137
  136. sglang/srt/models/mllama.py +4 -4
  137. sglang/srt/models/phi3_small.py +1 -1
  138. sglang/srt/models/qwen2.py +3 -0
  139. sglang/srt/models/qwen2_5_vl.py +68 -146
  140. sglang/srt/models/qwen2_classification.py +75 -0
  141. sglang/srt/models/qwen2_moe.py +9 -1
  142. sglang/srt/models/qwen2_vl.py +25 -63
  143. sglang/srt/openai_api/adapter.py +145 -47
  144. sglang/srt/openai_api/protocol.py +23 -2
  145. sglang/srt/sampling/sampling_batch_info.py +1 -1
  146. sglang/srt/sampling/sampling_params.py +6 -6
  147. sglang/srt/server_args.py +104 -14
  148. sglang/srt/speculative/build_eagle_tree.py +7 -347
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
  150. sglang/srt/speculative/eagle_utils.py +208 -252
  151. sglang/srt/speculative/eagle_worker.py +139 -53
  152. sglang/srt/speculative/spec_info.py +6 -1
  153. sglang/srt/torch_memory_saver_adapter.py +22 -0
  154. sglang/srt/utils.py +182 -21
  155. sglang/test/__init__.py +0 -0
  156. sglang/test/attention/__init__.py +0 -0
  157. sglang/test/attention/test_flashattn_backend.py +312 -0
  158. sglang/test/runners.py +2 -0
  159. sglang/test/test_activation.py +2 -1
  160. sglang/test/test_block_fp8.py +5 -4
  161. sglang/test/test_block_fp8_ep.py +2 -1
  162. sglang/test/test_dynamic_grad_mode.py +58 -0
  163. sglang/test/test_layernorm.py +3 -2
  164. sglang/test/test_utils.py +55 -4
  165. sglang/utils.py +31 -0
  166. sglang/version.py +1 -1
  167. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
  168. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +171 -125
  169. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
  170. sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
  171. sglang/srt/managers/image_processor.py +0 -55
  172. sglang/srt/managers/image_processors/base_image_processor.py +0 -219
  173. sglang/srt/managers/image_processors/minicpmv.py +0 -86
  174. sglang/srt/managers/multi_modality_padding.py +0 -134
  175. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
  176. {sglang-0.4.4.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/configs/deepseekvl2.py (new file)
@@ -0,0 +1,667 @@
+ import math
+ import os
+ from dataclasses import dataclass
+ from typing import Dict, List, Optional, Tuple
+
+ import torch
+ import torchvision.transforms as T
+ from PIL import Image, ImageOps
+ from transformers import (
+     AutoProcessor,
+     LlamaTokenizerFast,
+     PretrainedConfig,
+     ProcessorMixin,
+ )
+
+
+ def select_best_resolution(image_size, candidate_resolutions):
+     # used for cropping
+     original_width, original_height = image_size
+     best_fit = None
+     max_effective_resolution = 0
+     min_wasted_resolution = float("inf")
+
+     for width, height in candidate_resolutions:
+         scale = min(width / original_width, height / original_height)
+         downscaled_width, downscaled_height = int(original_width * scale), int(
+             original_height * scale
+         )
+         effective_resolution = min(
+             downscaled_width * downscaled_height, original_width * original_height
+         )
+         wasted_resolution = (width * height) - effective_resolution
+
+         if effective_resolution > max_effective_resolution or (
+             effective_resolution == max_effective_resolution
+             and wasted_resolution < min_wasted_resolution
+         ):
+             max_effective_resolution = effective_resolution
+             min_wasted_resolution = wasted_resolution
+             best_fit = (width, height)
+
+     return best_fit
+
+
+ class DictOutput(object):
+     def keys(self):
+         return self.__dict__.keys()
+
+     def __getitem__(self, item):
+         return self.__dict__[item]
+
+     def __setitem__(self, key, value):
+         self.__dict__[key] = value
+
+
+ @dataclass
+ class VLChatProcessorOutput(DictOutput):
+     input_ids: torch.LongTensor
+     target_ids: torch.LongTensor
+     images: torch.Tensor
+     images_seq_mask: torch.BoolTensor
+     images_spatial_crop: torch.LongTensor
+
+     def __len__(self):
+         return len(self.input_ids)
+
+
+ class ImageTransform(object):
+     def __init__(
+         self,
+         mean: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
+         std: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
+         normalize: bool = True,
+     ):
+         self.mean = mean
+         self.std = std
+         self.normalize = normalize
+
+         transform_pipelines = [T.ToTensor()]
+
+         if normalize:
+             transform_pipelines.append(T.Normalize(mean, std))
+
+         self.transform = T.Compose(transform_pipelines)
+
+     def __call__(self, pil_img: Image.Image):
+         x = self.transform(pil_img)
+         return x
+
+
+ class DeepseekVLV2Processor(ProcessorMixin):
+     tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+     attributes = ["tokenizer"]
+
+     def __init__(
+         self,
+         tokenizer: LlamaTokenizerFast,
+         candidate_resolutions: Tuple[Tuple[int, int]],
+         patch_size: int,
+         downsample_ratio: int,
+         image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+         image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+         normalize: bool = True,
+         image_token: str = "<image>",
+         pad_token: str = "<|▁pad▁|>",
+         add_special_token: bool = False,
+         sft_format: str = "deepseek",
+         mask_prompt: bool = True,
+         ignore_id: int = -100,
+         **kwargs,
+     ):
+
+         self.candidate_resolutions = candidate_resolutions
+         self.image_size = candidate_resolutions[0][0]
+         self.patch_size = patch_size
+         self.image_mean = image_mean
+         self.image_std = image_std
+         self.normalize = normalize
+         self.downsample_ratio = downsample_ratio
+
+         self.image_transform = ImageTransform(
+             mean=image_mean, std=image_std, normalize=normalize
+         )
+         self.tokenizer = tokenizer
+         # must set this,padding side with make a difference in batch inference
+         self.tokenizer.padding_side = "left"
+
+         # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
+         if tokenizer.pad_token is None:
+             self.tokenizer.add_special_tokens({"pad_token": pad_token})
+
+         # add image token
+         image_token_id = self.tokenizer.vocab.get(image_token)
+         if image_token_id is None:
+             special_tokens = [image_token]
+             special_tokens_dict = {"additional_special_tokens": special_tokens}
+             self.tokenizer.add_special_tokens(special_tokens_dict)
+         self.image_token_id = self.tokenizer.vocab.get(image_token)
+
+         # add five special tokens for grounding-related tasks
+         # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
+         special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
+         special_tokens_dict = {"additional_special_tokens": special_tokens}
+         self.tokenizer.add_special_tokens(special_tokens_dict)
+
+         # add special tokens for SFT data
+         special_tokens = ["<|User|>", "<|Assistant|>"]
+         special_tokens_dict = {"additional_special_tokens": special_tokens}
+         self.tokenizer.add_special_tokens(special_tokens_dict)
+
+         self.image_token = image_token
+         self.pad_token = pad_token
+         self.add_special_token = add_special_token
+         self.sft_format = sft_format
+         self.mask_prompt = mask_prompt
+         self.ignore_id = ignore_id
+
+         super().__init__(
+             tokenizer,
+             **kwargs,
+         )
+
+     def format_messages_v2(self, messages, pil_images, max_req_input_len=-1):
+         """play the role of format_messages_v2 and get_images_info in the last version"""
+         tokenized_data = []
+         masked_tokenized_data = []  # labels
+         images_list = []
+         images_seq_mask = []
+         images_spatial_crop = []
+
+         image_index = 0
+         image_token_cnt = messages.count(self.image_token)
+         tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
+             messages,
+             pil_images[image_index : image_index + image_token_cnt],
+             bos=False,
+             eos=True,
+             cropping=len(pil_images) <= 2,
+             max_req_input_len=max_req_input_len,
+         )
+
+         image_index = image_token_cnt
+         tokenized_data += tokenized_str
+         if self.mask_prompt:
+             masked_tokenized_data += [self.ignore_id] * len(tokenized_str)
+         else:
+             masked_tokenized_data += tokenized_str
+         images_list += images
+         images_seq_mask += seq_mask
+         images_spatial_crop += spatial_crop
+
+         assert len(tokenized_data) == len(
+             images_seq_mask
+         ), f"format_messages_v2: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+
+         return (
+             tokenized_data,
+             masked_tokenized_data,
+             images_list,
+             images_seq_mask,
+             images_spatial_crop,
+         )
+
+     @property
+     def bos_id(self):
+         return self.tokenizer.bos_token_id
+
+     @property
+     def eos_id(self):
+         return self.tokenizer.eos_token_id
+
+     @property
+     def pad_id(self):
+         return self.tokenizer.pad_token_id
+
+     def encode(self, text: str, bos: bool = True, eos: bool = False):
+         t = self.tokenizer.encode(text, add_special_tokens=False)
+
+         if bos:
+             t = [self.bos_id] + t
+         if eos:
+             t = t + [self.eos_id]
+
+         return t
+
+     def decode(self, t: List[int], **kwargs) -> str:
+         return self.tokenizer.decode(t, **kwargs)
+
+     def process_one(
+         self,
+         prompt: str = None,
+         conversations: List[Dict[str, str]] = None,
+         images: List[Image.Image] = None,
+         apply_sft_format: bool = False,
+         inference_mode: bool = True,
+         system_prompt: str = "",
+         max_req_input_len: int = -1,
+         **kwargs,
+     ):
+         """
+
+         Args:
+             prompt (str): the formatted prompt;
+             conversations (List[Dict]): conversations with a list of messages;
+             images (List[ImageType]): the list of images;
+             apply_sft_format (bool): if prompt is not None, then apply the SFT format to prompt;
+                 if conversations is not None, then it will always apply the SFT format to conversations;
+             inference_mode (bool): if True, then remove the last eos token;
+             system_prompt (str): the system prompt;
+             **kwargs:
+
+         Returns:
+             outputs (BaseProcessorOutput): the output of the processor,
+                 - input_ids (torch.LongTensor): [N + image tokens]
+                 - target_ids (torch.LongTensor): [N + image tokens]
+                 - images (torch.FloatTensor): [n_images, 3, H, W]
+                 - image_id (int): the id of the image token
+                 - num_image_tokens (List[int]): the number of image tokens
+         """
+
+         assert (
+             prompt is None or conversations is None
+         ), "prompt and conversations cannot be used at the same time."
+
+         (
+             tokenized_str,
+             masked_tokenized_str,
+             images_list,
+             images_seq_mask,
+             images_spatial_crop,
+         ) = self.format_messages_v2(conversations, images, max_req_input_len)
+
+         assert (
+             len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
+         ), (
+             f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+             f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
+         )
+
+         input_ids = torch.LongTensor(tokenized_str)
+         target_ids = torch.LongTensor(masked_tokenized_str)
+         images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
+
+         # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
+         target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
+             self.ignore_id
+         )
+         input_ids[input_ids < 0] = self.pad_id
+
+         if inference_mode:
+             assert input_ids[-1] == self.eos_id
+             input_ids = input_ids[:-1]
+             target_ids = target_ids[:-1]
+             images_seq_mask = images_seq_mask[:-1]
+
+         if len(images_list) == 0:
+             images = torch.zeros((1, 3, self.image_size, self.image_size))
+             images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
+         else:
+             images = torch.stack(images_list, dim=0)
+             images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+
+         prepare = VLChatProcessorOutput(
+             input_ids=input_ids,
+             target_ids=target_ids,
+             images=images,
+             images_seq_mask=images_seq_mask,
+             images_spatial_crop=images_spatial_crop,
+         )
+
+         return prepare
+
+     def __call__(
+         self,
+         *,
+         prompt: str = None,
+         conversations: List[Dict[str, str]] = None,
+         images: List[Image.Image] = None,
+         apply_sft_format: bool = False,
+         inference_mode: bool = True,
+         system_prompt: str = "",
+         max_req_input_len: int = -1,
+         **kwargs,
+     ):
+         prepare = self.process_one(
+             prompt=prompt,
+             conversations=conversations,
+             images=images,
+             apply_sft_format=apply_sft_format,
+             inference_mode=inference_mode,
+             system_prompt=system_prompt,
+             max_req_input_len=max_req_input_len,
+         )
+
+         return prepare
+
+     def find_all_indices(self, messages, target_value):
+         indices = []
+         for index, item in enumerate(messages):
+             if item == target_value:
+                 indices.append(index)
+         return indices
+
+     def tokenize_with_images(
+         self,
+         conversation: str,
+         images: List[Image.Image],
+         bos: bool = True,
+         eos: bool = True,
+         cropping: bool = True,
+         max_req_input_len: int = -1,
+     ):
+         """Tokenize text with <image> tags."""
+         images_list, images_seq_mask, images_spatial_crop = [], [], []
+         text_splits = conversation.split(self.image_token)
+         tokenized_str = []
+         for text_sep, image in zip(text_splits, images):
+             """encode text_sep"""
+             tokenized_sep = self.encode(text_sep, bos=False, eos=False)
+             tokenized_str += tokenized_sep
+             images_seq_mask += [False] * len(tokenized_sep)
+
+             """select best resolution for anyres"""
+             if cropping:
+                 best_width, best_height = select_best_resolution(
+                     image.size, self.candidate_resolutions
+                 )
+             else:
+                 best_width, best_height = self.image_size, self.image_size
+             # print(image.size, (best_width, best_height)) # check the select_best_resolutions func
+
+             """process the global view"""
+             global_view = ImageOps.pad(
+                 image,
+                 (self.image_size, self.image_size),
+                 color=tuple(int(x * 255) for x in self.image_transform.mean),
+             )
+             images_list.append(self.image_transform(global_view))
+
+             """process the local views"""
+             local_view = ImageOps.pad(
+                 image,
+                 (best_width, best_height),
+                 color=tuple(int(x * 255) for x in self.image_transform.mean),
+             )
+             for i in range(0, best_height, self.image_size):
+                 for j in range(0, best_width, self.image_size):
+                     images_list.append(
+                         self.image_transform(
+                             local_view.crop(
+                                 (j, i, j + self.image_size, i + self.image_size)
+                             )
+                         )
+                     )
+
+             """record height / width crop num"""
+             num_width_tiles, num_height_tiles = (
+                 best_width // self.image_size,
+                 best_height // self.image_size,
+             )
+             images_spatial_crop.append([num_width_tiles, num_height_tiles])
+
+             """add image tokens"""
+             h = w = math.ceil(
+                 (self.image_size // self.patch_size) / self.downsample_ratio
+             )
+             # global views tokens h * (w + 1), 1 is for line seperator
+             tokenized_image = [self.image_token_id] * h * (w + 1)
+             # add a seperator between global and local views
+             tokenized_image += [self.image_token_id]
+             # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
+             tokenized_image += (
+                 [self.image_token_id]
+                 * (num_height_tiles * h)
+                 * (num_width_tiles * w + 1)
+             )
+
+             tokenized_str += tokenized_image
+             images_seq_mask += [True] * len(tokenized_image)
+             # print(width_crop_num, height_crop_num, len(tokenized_image)) # test the correctness of the number of image-related tokens
+
+         """process the last text split"""
+         tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
+         # deal with video, limit with request len
+         if max_req_input_len > -1:
+             if max_req_input_len < len(tokenized_sep) + len(tokenized_str) - 1:
+                 rest = max_req_input_len - len(tokenized_sep) - 1 - 1024
+                 tokenized_str = tokenized_str[:rest]
+                 images_seq_mask = images_seq_mask[:rest]
+         tokenized_str += tokenized_sep
+         images_seq_mask += [False] * len(tokenized_sep)
+
+         """add the bos and eos tokens"""
+         if bos:
+             tokenized_str = [self.bos_id] + tokenized_str
+             images_seq_mask = [False] + images_seq_mask
+         if eos:
+             tokenized_str = tokenized_str + [self.eos_id]
+             images_seq_mask = images_seq_mask + [False]
+
+         assert len(tokenized_str) == len(
+             images_seq_mask
+         ), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+
+         return tokenized_str, images_list, images_seq_mask, images_spatial_crop
+
+
+ class DeepseekVL2VisionEncoderConfig(PretrainedConfig):
+     model_type: str = "vision"
+
+     model_name: str = "siglip_large_patch16_384"
+     image_size: int = 384
+     patch_size: int = 16
+     width: int = 1024
+     layers: int = 24
+     heads: int = 16
+     mlp_ratio: int = 4
+     global_pool: str = "map"
+     ignore_head: bool = True
+     class_token: bool = False
+     num_classes: int = 0
+     use_checkpoint: bool = False
+     weight_init: str = "skip"
+     deterministic: bool = False
+     num_recomputing_layers: int = 0
+
+     def __init__(
+         self,
+         model_name: str = "siglip_large_patch16_384",
+         image_size: int = 384,
+         patch_size: int = 16,
+         width: int = 1024,
+         layers: int = 24,
+         heads: int = 16,
+         mlp_ratio: int = 4,
+         global_pool: str = "map",
+         ignore_head: bool = True,
+         class_token: bool = False,
+         num_classes: int = 0,
+         use_checkpoint: bool = False,
+         **kwargs,
+     ):
+         self.model_name = model_name
+         self.image_size = image_size
+         self.patch_size = patch_size
+         self.width = width
+         self.layers = layers
+         self.heads = heads
+         self.mlp_ratio = mlp_ratio
+         self.global_pool = global_pool
+         self.ignore_head = ignore_head
+         self.class_token = class_token
+         self.num_classes = num_classes
+         self.use_checkpoint = use_checkpoint
+
+         super().__init__(**kwargs)
+
+
+ class DeepseekVL2MlpProjectorConfig(PretrainedConfig):
+     model_type = "mlp_projector"
+     projector_type: str = "downsample_mlp_gelu"
+     input_dim: int = 1152
+     n_embed: int = 2048
+     depth: int = 2
+     mlp_ratio: int = 1
+     downsample_ratio: int = 2
+     token_pooling: bool = False
+
+     def __init__(
+         self,
+         projector_type: str = "downsample_mlp_gelu",
+         input_dim: int = 1152,
+         n_embed: int = 2048,
+         depth: int = 2,
+         mlp_ratio: int = 1,
+         downsample_ratio: int = 2,
+         **kwargs,
+     ):
+         self.projector_type = projector_type
+         self.input_dim = input_dim
+         self.n_embed = n_embed
+         self.depth = depth
+         self.mlp_ratio = mlp_ratio
+         self.downsample_ratio = downsample_ratio
+
+         super().__init__(**kwargs)
+
+
+ class DeepseekV2Config(PretrainedConfig):
+
+     model_type = "deepseek_v2"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=102400,
+         hidden_size=4096,
+         intermediate_size=11008,
+         moe_intermediate_size=1407,
+         num_hidden_layers=30,
+         num_attention_heads=32,
+         num_key_value_heads=32,
+         n_shared_experts=None,
+         n_routed_experts=None,
+         ep_size=1,
+         routed_scaling_factor=1.0,
+         kv_lora_rank=512,
+         q_lora_rank=1536,
+         qk_rope_head_dim=64,
+         v_head_dim=128,
+         qk_nope_head_dim=128,
+         topk_method="gready",
+         n_group=None,
+         topk_group=None,
+         num_experts_per_tok=None,
+         moe_layer_freq=1,
+         first_k_dense_replace=0,
+         norm_topk_prob=False,
+         scoring_func="softmax",
+         aux_loss_alpha=0.001,
+         seq_aux=True,
+         hidden_act="silu",
+         max_position_embeddings=2048,
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         pad_token_id=None,
+         bos_token_id=100000,
+         eos_token_id=100001,
+         pretraining_tp=1,
+         tie_word_embeddings=False,
+         rope_theta=10000.0,
+         rope_scaling=None,
+         attention_bias=False,
+         attention_dropout=0.0,
+         use_mla=True,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.moe_intermediate_size = moe_intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.n_shared_experts = n_shared_experts
+         self.n_routed_experts = n_routed_experts
+         self.ep_size = ep_size
+         self.routed_scaling_factor = routed_scaling_factor
+         self.kv_lora_rank = kv_lora_rank
+         self.q_lora_rank = q_lora_rank
+         self.qk_rope_head_dim = qk_rope_head_dim
+         self.v_head_dim = v_head_dim
+         self.qk_nope_head_dim = qk_nope_head_dim
+         self.topk_method = topk_method
+         self.n_group = n_group
+         self.topk_group = topk_group
+         self.num_experts_per_tok = num_experts_per_tok
+         self.moe_layer_freq = moe_layer_freq
+         self.first_k_dense_replace = first_k_dense_replace
+         self.norm_topk_prob = norm_topk_prob
+         self.scoring_func = scoring_func
+         self.aux_loss_alpha = aux_loss_alpha
+         self.seq_aux = seq_aux
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = float(rms_norm_eps)
+         self.pretraining_tp = pretraining_tp
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.attention_bias = attention_bias
+         self.attention_dropout = attention_dropout
+         self.use_mla = use_mla
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+
+ class DeepseekVL2Config(PretrainedConfig):
+     model_type = "deepseek_vl_v2"
+     vision_config: DeepseekVL2VisionEncoderConfig
+     projector_config: DeepseekVL2MlpProjectorConfig
+     language_config: DeepseekV2Config
+
+     tile_tag: str = "2D"
+     global_view_pos: str = "head"
+     candidate_resolutions: Tuple[Tuple[int, int]] = ((384, 384),)
+
+     def __init__(
+         self,
+         tile_tag: str = "tile_tag",
+         global_view_pos: str = "head",
+         candidate_resolutions: Tuple[Tuple[int, int]] = ((384, 384),),
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         vision_config = kwargs.get("vision_config", {})
+         self.vision_config = DeepseekVL2VisionEncoderConfig(**vision_config)
+
+         projector_config = kwargs.get("projector_config", {})
+         self.projector_config = DeepseekVL2MlpProjectorConfig(**projector_config)
+
+         language_config = kwargs.get("language_config", {})
+         if isinstance(language_config, DeepseekV2Config):
+             self.language_config = language_config
+         else:
+             self.language_config = DeepseekV2Config(**language_config)
+
+         self.tile_tag = tile_tag
+         self.global_view_pos = global_view_pos
+         self.candidate_resolutions = candidate_resolutions
+         self.architectures = ["DeepseekVL2ForCausalLM"]
+
+
+ AutoProcessor.register(DeepseekVL2Config, DeepseekVLV2Processor)
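Editor's note: the tile and image-token arithmetic added in this file can be sanity-checked in isolation. The sketch below is a condensed re-implementation of select_best_resolution plus the token-count formula from tokenize_with_images; the candidate list and the 1024x512 input are hypothetical, and image_size=384, patch_size=16, downsample_ratio=2 mirror the defaults in the vision and projector configs above.

import math

def select_best_resolution(image_size, candidate_resolutions):
    # Same selection rule as the hunk above: maximize effective resolution,
    # break ties by minimizing wasted area.
    original_width, original_height = image_size
    best_fit, max_eff, min_wasted = None, 0, float("inf")
    for width, height in candidate_resolutions:
        scale = min(width / original_width, height / original_height)
        eff = min(
            int(original_width * scale) * int(original_height * scale),
            original_width * original_height,
        )
        wasted = width * height - eff
        if eff > max_eff or (eff == max_eff and wasted < min_wasted):
            max_eff, min_wasted, best_fit = eff, wasted, (width, height)
    return best_fit

image_size, patch_size, downsample_ratio = 384, 16, 2
candidates = [(384, 384), (384, 768), (768, 384)]  # illustrative only

best_w, best_h = select_best_resolution((1024, 512), candidates)  # -> (768, 384)
num_w_tiles, num_h_tiles = best_w // image_size, best_h // image_size

# Token count per image, following tokenize_with_images: h * (w + 1) global view
# tokens, one separator, then (num_h_tiles * h) * (num_w_tiles * w + 1) local tokens.
h = w = math.ceil((image_size // patch_size) / downsample_ratio)  # 12
n_image_tokens = h * (w + 1) + 1 + (num_h_tiles * h) * (num_w_tiles * w + 1)
print(best_w, best_h, n_image_tokens)  # 768 384 457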
sglang/srt/configs/janus_pro.py
@@ -9,8 +9,6 @@ import PIL
  import torch
  from PIL.Image import Image
  from transformers import (
-     AutoImageProcessor,
-     AutoProcessor,
      BaseImageProcessor,
      BatchFeature,
      LlamaConfig,
@@ -20,6 +18,7 @@ from transformers import (
  )
  from transformers.image_utils import to_numpy_array

+ from sglang.srt.configs.utils import register_image_processor, register_processor
  from sglang.srt.mm_utils import expand2square

@@ -625,5 +624,5 @@ class VLMImageProcessorConfig(PretrainedConfig):
          super().__init__(**kwargs)


- AutoProcessor.register(MultiModalityConfig, VLChatProcessor, exist_ok=True)
- AutoImageProcessor.register(VLMImageProcessorConfig, None, VLMImageProcessor, None)
+ register_processor(MultiModalityConfig, VLChatProcessor)
+ register_image_processor(MultiModalityConfig, VLMImageProcessor)
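Editor's note: this hunk swaps the direct transformers registration calls for helpers from the new sglang.srt.configs.utils module, whose body is not shown in this diff. A plausible sketch of such helpers, assuming they are thin wrappers over the same calls removed above (names, signatures, and behavior here are assumptions, not the actual +25-line file):

# Hypothetical sketch only; the real sglang/srt/configs/utils.py is not part of this diff.
from transformers import AutoImageProcessor, AutoProcessor


def register_processor(config_class, processor_class) -> None:
    # Mirror the AutoProcessor.register(..., exist_ok=True) call removed above,
    # centralized so every config module registers processors the same way.
    AutoProcessor.register(config_class, processor_class, exist_ok=True)


def register_image_processor(config_class, image_processor_class) -> None:
    # Mirror the removed AutoImageProcessor.register call; the extra None slots
    # match the four-argument form used there.
    AutoImageProcessor.register(config_class, None, image_processor_class, None)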
sglang/srt/configs/load_config.py
@@ -22,6 +22,7 @@ class LoadFormat(str, enum.Enum):
      MISTRAL = "mistral"
      LAYERED = "layered"
      JAX = "jax"
+     REMOTE = "remote"


  @dataclass
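Editor's note: LoadFormat subclasses both str and enum.Enum (per the hunk header), so the new REMOTE member compares directly against plain strings coming from configuration or command-line parsing. A minimal reconstruction showing only the members visible in this hunk (the full enum has more values not shown here):

import enum


class LoadFormat(str, enum.Enum):
    MISTRAL = "mistral"
    LAYERED = "layered"
    JAX = "jax"
    REMOTE = "remote"


# Value lookup returns the canonical member, and the str mixin makes equality
# against raw strings hold, which is how a string option can select REMOTE.
assert LoadFormat("remote") is LoadFormat.REMOTE
assert LoadFormat.REMOTE == "remote"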