sglang 0.4.4.post3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. sglang/bench_serving.py +49 -7
  2. sglang/lang/chat_template.py +24 -0
  3. sglang/srt/_custom_ops.py +59 -92
  4. sglang/srt/configs/model_config.py +5 -0
  5. sglang/srt/constrained/base_grammar_backend.py +5 -1
  6. sglang/srt/conversation.py +29 -4
  7. sglang/srt/custom_op.py +5 -0
  8. sglang/srt/distributed/device_communicators/custom_all_reduce.py +27 -79
  9. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  10. sglang/srt/entrypoints/engine.py +0 -5
  11. sglang/srt/layers/attention/flashattention_backend.py +678 -83
  12. sglang/srt/layers/attention/flashinfer_backend.py +5 -7
  13. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
  14. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  15. sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
  16. sglang/srt/layers/moe/ep_moe/layer.py +79 -80
  17. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
  18. sglang/srt/layers/moe/fused_moe_native.py +5 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +416 -50
  30. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  31. sglang/srt/layers/moe/topk.py +49 -3
  32. sglang/srt/layers/quantization/__init__.py +5 -1
  33. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  34. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
  35. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
  36. sglang/srt/layers/quantization/fp8.py +3 -1
  37. sglang/srt/layers/quantization/fp8_utils.py +1 -4
  38. sglang/srt/layers/quantization/moe_wna16.py +503 -0
  39. sglang/srt/layers/quantization/utils.py +1 -1
  40. sglang/srt/layers/quantization/w8a8_int8.py +2 -0
  41. sglang/srt/layers/radix_attention.py +2 -0
  42. sglang/srt/layers/rotary_embedding.py +63 -12
  43. sglang/srt/managers/cache_controller.py +34 -11
  44. sglang/srt/managers/mm_utils.py +202 -156
  45. sglang/srt/managers/multimodal_processor.py +0 -2
  46. sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
  47. sglang/srt/managers/multimodal_processors/clip.py +7 -26
  48. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
  49. sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
  50. sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
  51. sglang/srt/managers/multimodal_processors/llava.py +34 -14
  52. sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
  53. sglang/srt/managers/multimodal_processors/mlama.py +10 -23
  54. sglang/srt/managers/multimodal_processors/mllama4.py +161 -0
  55. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
  56. sglang/srt/managers/schedule_batch.py +185 -128
  57. sglang/srt/managers/scheduler.py +4 -4
  58. sglang/srt/managers/tokenizer_manager.py +1 -1
  59. sglang/srt/managers/utils.py +1 -6
  60. sglang/srt/mem_cache/hiradix_cache.py +62 -52
  61. sglang/srt/mem_cache/memory_pool.py +72 -6
  62. sglang/srt/mem_cache/paged_allocator.py +39 -0
  63. sglang/srt/metrics/collector.py +23 -53
  64. sglang/srt/model_executor/cuda_graph_runner.py +8 -6
  65. sglang/srt/model_executor/forward_batch_info.py +10 -10
  66. sglang/srt/model_executor/model_runner.py +60 -57
  67. sglang/srt/model_loader/loader.py +8 -0
  68. sglang/srt/models/clip.py +12 -7
  69. sglang/srt/models/deepseek_janus_pro.py +10 -15
  70. sglang/srt/models/deepseek_v2.py +212 -121
  71. sglang/srt/models/deepseek_vl2.py +105 -104
  72. sglang/srt/models/gemma3_mm.py +14 -80
  73. sglang/srt/models/llama.py +16 -5
  74. sglang/srt/models/llama4.py +420 -0
  75. sglang/srt/models/llava.py +31 -19
  76. sglang/srt/models/llavavid.py +16 -7
  77. sglang/srt/models/minicpmo.py +63 -147
  78. sglang/srt/models/minicpmv.py +17 -27
  79. sglang/srt/models/mllama.py +29 -14
  80. sglang/srt/models/mllama4.py +154 -0
  81. sglang/srt/models/qwen2.py +9 -6
  82. sglang/srt/models/qwen2_5_vl.py +21 -31
  83. sglang/srt/models/qwen2_vl.py +20 -21
  84. sglang/srt/openai_api/adapter.py +18 -6
  85. sglang/srt/platforms/interface.py +371 -0
  86. sglang/srt/server_args.py +99 -14
  87. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
  88. sglang/srt/speculative/eagle_utils.py +140 -28
  89. sglang/srt/speculative/eagle_worker.py +93 -24
  90. sglang/srt/utils.py +104 -51
  91. sglang/test/test_custom_ops.py +55 -0
  92. sglang/test/test_utils.py +13 -26
  93. sglang/utils.py +2 -2
  94. sglang/version.py +1 -1
  95. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/METADATA +4 -3
  96. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/RECORD +99 -84
  97. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/top_level.txt +0 -0

sglang/srt/managers/multimodal_processors/base_processor.py

@@ -8,19 +8,10 @@ from typing import Optional
 
 import numpy as np
 import PIL
-import transformers
 from decord import VideoReader, cpu
-from openai import BadRequestError
 from PIL import Image
 
-from sglang.srt.utils import load_audio, load_image, logger
-
-global global_processor
-
-
-def get_global_processor():
-    global global_processor
-    return global_processor
+from sglang.srt.utils import encode_video, load_audio, load_image, logger
 
 
 @dataclasses.dataclass
@@ -28,9 +19,6 @@ class BaseMultiModalProcessorOutput:
     # input_text, with each frame of video/image represented with a image_token
     input_text: str
 
-    mm_data_hashes: Optional[list[int]]
-    # images
-    image_sizes: Optional[list[int]]
     # frames loaded from image and video, in given order
     images: Optional[list[PIL.Image]] = None
 
@@ -38,7 +26,7 @@ class BaseMultiModalProcessorOutput:
     audios: Optional[list[np.ndarray]] = None
 
     def normalize(self):
-        for field_name in ["data_hashes", "image_sizes", "images", "audios"]:
+        for field_name in ["image_sizes", "images", "audios"]:
             field = getattr(self, field_name, None)
             if field is not None and isinstance(field, list) and len(field) == 0:
                 setattr(self, field_name, None)
@@ -68,28 +56,35 @@ class BaseMultimodalProcessor(ABC):
         # FIXME: not accurate, model and image specific
         self.NUM_TOKEN_PER_FRAME = 330
 
-        # Initialize global processor first
-        init_global_processor(self, server_args)
-
-        self.executor = concurrent.futures.ProcessPoolExecutor(
-            initializer=init_global_processor,
+        self.io_executor = concurrent.futures.ThreadPoolExecutor(
+            max_workers=int(os.environ.get("SGLANG_IO_WORKERS", 4))
+        )
+        self.cpu_executor = concurrent.futures.ProcessPoolExecutor(
             mp_context=mp.get_context("fork"),
-            initargs=(
-                self,
-                server_args,
-            ),
-            max_workers=int(os.environ.get("SGLANG_CPU_COUNT", os.cpu_count())),
+            max_workers=int(os.environ.get("SGLANG_CPU_WORKERS", os.cpu_count())),
         )
 
-    def _build_processor(self, server_args):
-        """Init the global processor for multi modal models."""
-        from sglang.srt.hf_transformers_utils import get_processor
-
-        return get_processor(
-            server_args.tokenizer_path,
-            tokenizer_mode=server_args.tokenizer_mode,
-            trust_remote_code=server_args.trust_remote_code,
+    def process_mm_data(
+        self, input_text, images=None, videos=None, audios=None, **kwargs
+    ):
+        """
+        process multimodal data with transformers AutoProcessor
+        """
+        if images is not None:
+            kwargs["images"] = images
+        if videos is not None:
+            kwargs["videos"] = videos
+        if audios is not None:
+            kwargs["audios"] = audios
+
+        processor = self._processor
+        result = processor.__call__(
+            text=[input_text],
+            padding=True,
+            return_tensors="pt",
+            **kwargs,
         )
+        return result
 
     @abstractmethod
     async def process_mm_data_async(
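
The hunk above replaces the fork-initialized global-processor pool with two executors sized by the new SGLANG_IO_WORKERS and SGLANG_CPU_WORKERS environment variables. A minimal standalone sketch of how such a pair is typically driven from async request handling (not taken from the diff; fetch_image, preprocess, and handle_request are illustrative placeholders, and the fork context assumes a Unix-like host):

    import asyncio
    import concurrent.futures
    import multiprocessing as mp
    import os


    def fetch_image(url: str) -> bytes:
        # Stand-in for blocking network/disk I/O.
        return b"raw bytes for " + url.encode()


    def preprocess(raw: bytes) -> int:
        # Stand-in for CPU-heavy preprocessing (resizing, tensorization, ...).
        return len(raw)


    # Mirrors the executors created in BaseMultimodalProcessor.__init__ above.
    io_executor = concurrent.futures.ThreadPoolExecutor(
        max_workers=int(os.environ.get("SGLANG_IO_WORKERS", 4))
    )
    cpu_executor = concurrent.futures.ProcessPoolExecutor(
        mp_context=mp.get_context("fork"),
        max_workers=int(os.environ.get("SGLANG_CPU_WORKERS", os.cpu_count())),
    )


    async def handle_request(url: str) -> int:
        loop = asyncio.get_event_loop()
        # Blocking I/O overlaps cheaply on threads ...
        raw = await loop.run_in_executor(io_executor, fetch_image, url)
        # ... while CPU-bound work goes to worker processes, avoiding GIL contention.
        return await loop.run_in_executor(cpu_executor, preprocess, raw)


    if __name__ == "__main__":
        print(asyncio.run(handle_request("http://example.com/cat.png")))
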
@@ -116,33 +111,9 @@ class BaseMultimodalProcessor(ABC):
 
         return estimated_frames_list
 
-    @staticmethod
-    def encode_video(video_path, frame_count_limit=None):
-        if not os.path.exists(video_path):
-            logger.error(f"Video {video_path} does not exist")
-            return []
-
-        if frame_count_limit == 0:
-            return []
-
-        def uniform_sample(l, n):
-            gap = len(l) / n
-            idxs = [int(i * gap + gap / 2) for i in range(n)]
-            return [l[i] for i in idxs]
-
-        vr = VideoReader(video_path, ctx=cpu(0))
-        sample_fps = round(vr.get_avg_fps() / 1)  # FPS
-        frame_indices = [i for i in range(0, len(vr), sample_fps)]
-        if frame_count_limit is not None and len(frame_indices) > frame_count_limit:
-            frame_indices = uniform_sample(frame_indices, frame_count_limit)
-
-        frames = vr.get_batch(frame_indices).asnumpy()
-        frames = [Image.fromarray(v.astype("uint8")) for v in frames]
-        return frames
-
     def load_mm_data(
         self,
-        input_ids: list[int],
+        prompt: str,
         multimodal_tokens: MultimodalSpecialTokens,
         max_req_input_len: int,
         image_data: Optional[list] = None,
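
encode_video itself is not deleted; it moves out of the class and into sglang.srt.utils (note the new import in the first hunk). A short usage sketch, assuming sglang 0.4.5 with decord installed and a local video file; per the removed implementation it samples roughly one frame per second of video and returns a list of PIL images, optionally capped by frame_count_limit:

    from sglang.srt.utils import encode_video

    # "clip.mp4" is a placeholder path; frame_count_limit mirrors the
    # frames_to_process value passed at the call site in load_mm_data.
    frames = encode_video("clip.mp4", frame_count_limit=30)
    print(f"sampled {len(frames)} frames:", [frame.size for frame in frames[:3]])
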
@@ -168,11 +139,11 @@ class BaseMultimodalProcessor(ABC):
         else:
             multimodal_tokens.image_token = multimodal_tokens.image_token
 
-        if isinstance(input_ids, list) and return_text:
-            assert len(input_ids) and isinstance(input_ids[0], int)
-            input_text = self._processor.tokenizer.decode(input_ids)
+        if isinstance(prompt, list) and return_text:
+            assert len(prompt) and isinstance(prompt[0], int)
+            prompt = self._processor.tokenizer.decode(prompt)
         else:
-            input_text = input_ids
+            prompt = prompt
         if return_text:
             import re
 
@@ -182,7 +153,7 @@ class BaseMultimodalProcessor(ABC):
                 + ")"
             )
             # split text into list of normal text and special tokens
-            text_parts = re.split(pattern, input_text)
+            text_parts = re.split(pattern, prompt)
 
             # TODO(mick): load from server_args, env, or sampling_params
             MAX_NUM_FRAMES = 30
@@ -218,7 +189,7 @@ class BaseMultimodalProcessor(ABC):
                     ):
                         # video
                         path = image_file[len("video:") :]
-                        frames = BaseMultimodalProcessor.encode_video(
+                        frames = encode_video(
                             path, frame_count_limit=frames_to_process
                         )
                     else:
@@ -231,7 +202,16 @@ class BaseMultimodalProcessor(ABC):
                         continue
 
                     image_sizes += frames[0].size * len(frames)
-                    hashes += [hash(image_file)] * len(frames)
+
+                    # Generate a hashable value for the image file
+                    if isinstance(image_file, Image.Image):
+                        # For PIL.Image objects, use the ID as a hashable value
+                        hash_value = hash(id(image_file))
+                    else:
+                        # For other types (strings, etc.), use the regular hash
+                        hash_value = hash(image_file)
+
+                    hashes += [hash_value] * len(frames)
                     images += frames
                     image_index += 1
                     if frames_to_process != 0:
@@ -252,24 +232,12 @@ class BaseMultimodalProcessor(ABC):
 
         except Exception as e:
             logger.error(f"An exception occurred while loading images: {e}")
-            raise BadRequestError(
-                f"An exception occurred while loading images: {e}"
-            )
+            raise RuntimeError(f"An exception occurred while loading images: {e}")
 
         out = BaseMultiModalProcessorOutput(
-            mm_data_hashes=hashes,
-            image_sizes=image_sizes,
             images=images,
             audios=audios,
             input_text=new_text,
         )
         out.normalize()
         return out
-
-
-def init_global_processor(sglang_processor: BaseMultimodalProcessor, server_args):
-    """
-    Init the global processor for multimodal models."""
-    global global_processor
-    transformers.logging.set_verbosity_error()
-    global_processor = sglang_processor._build_processor(server_args=server_args)
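
Taken together, the base-class changes define the contract the per-model processors below are rewritten against: load_mm_data expands image/video placeholders and loads frames, process_mm_data runs the shared Hugging Face processor call, and the result is returned as MultimodalDataItem entries under "mm_items" instead of loose pixel_values/data_hashes keys. A condensed sketch of that shape, modeled on the clip and gemma3 hunks that follow (MyImageProcessor and the "<image>" token are illustrative, and the keys available on ret depend on the underlying processor):

    from sglang.srt.managers.multimodal_processors.base_processor import (
        BaseMultimodalProcessor,
        MultimodalSpecialTokens,
    )
    from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem


    class MyImageProcessor(BaseMultimodalProcessor):  # illustrative subclass
        async def process_mm_data_async(
            self, image_data, input_ids, request_obj, max_req_input_len, *args, **kwargs
        ):
            # 1. Expand placeholders in the prompt and load the raw frames.
            base_output = self.load_mm_data(
                prompt=input_ids,
                image_data=image_data,
                multimodal_tokens=MultimodalSpecialTokens(image_token="<image>"),
                max_req_input_len=max_req_input_len,
            )
            # 2. Run the shared HF-processor call added to the base class.
            ret = self.process_mm_data(
                input_text=base_output.input_text, images=base_output.images
            )
            # 3. Package tensors as MultimodalDataItem entries instead of loose keys.
            return {
                "mm_items": [
                    MultimodalDataItem(
                        pixel_values=ret["pixel_values"], modality=Modality.IMAGE
                    )
                ],
                "input_ids": ret["input_ids"].flatten().tolist(),
            }
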

sglang/srt/managers/multimodal_processors/clip.py

@@ -1,10 +1,9 @@
-import asyncio
 from typing import List, Union
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.clip import CLIPModel
 from sglang.srt.utils import load_image
 
@@ -15,29 +14,6 @@ class ClipImageProcessor(BaseMultimodalProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
 
-    @staticmethod
-    def _process_single_image_task(images, input_text):
-        # input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask'
-        return get_global_processor()(
-            images=images, text=input_text, return_tensors="pt"
-        )
-
-    async def _process_single_image(self, images, input_text):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            image_inputs = await loop.run_in_executor(
-                self.executor,
-                ClipImageProcessor._process_single_image_task,
-                images,
-                input_text,
-            )
-        else:
-            image_inputs = self._processor(
-                images=images, text=[input_text], return_tensors="pt"
-            )
-
-        return image_inputs
-
     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
@@ -56,8 +32,13 @@ class ClipImageProcessor(BaseMultimodalProcessor):
         else:
             images = load_image(image_data[0])[0]
 
-        image_inputs = await self._process_single_image(images, input_text)
+        image_inputs = self.process_mm_data(input_text=input_text, images=images)
         image_inputs["data_hashes"] = [hash(str(image_data))]
         image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
+        image_inputs["mm_items"] = [
+            MultimodalDataItem(
+                pixel_values=image_inputs["pixel_values"], modality=Modality.IMAGE
+            )
+        ]
 
         return image_inputs

sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py

@@ -16,15 +16,14 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-import asyncio
 
 import torch
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM
 
 
@@ -35,51 +34,6 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         super().__init__(hf_config, server_args, _processor)
         self.IMAGE_TOKEN = "<image>"
 
-    @staticmethod
-    def _process_images_task(image, input_text, max_req_input_len):
-        processor = get_global_processor()
-        res = processor.__call__(
-            conversations=input_text, images=image, max_req_input_len=max_req_input_len
-        )
-
-        image_token_id = processor.image_token_id
-
-        res["im_token_id"] = image_token_id
-        return res
-
-    async def _process_images(self, image_data, input_text, max_req_input_len):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            image_inputs = await loop.run_in_executor(
-                self.executor,
-                DeepseekVL2ImageProcessor._process_images_task,
-                image_data,
-                input_text,
-                max_req_input_len,
-            )
-        else:
-            image_inputs = self._process_images_task(
-                image_data, input_text, max_req_input_len
-            )
-
-        return image_inputs
-
-    async def _process_images(self, image_data, input_text, max_req_input_len):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            image_inputs = await loop.run_in_executor(
-                self.executor,
-                DeepseekVL2ImageProcessor._process_images_task,
-                image_data,
-                input_text,
-                max_req_input_len,
-            )
-        else:
-            image_inputs = self._process_images_task(
-                image_data, input_text, max_req_input_len
-            )
-        return image_inputs
-
     async def process_mm_data_async(
         self, image_data, input_ids, request_obj, max_req_input_len, *args, **kwargs
     ):
@@ -89,8 +43,6 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         if not isinstance(image_data, list):
            image_data = [image_data]
 
-        images, image_sizes = [], []
-
         image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
             input_ids,
@@ -98,8 +50,11 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
             multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
             max_req_input_len=max_req_input_len,
         )
-        res = await self._process_images(
-            base_output.images, base_output.input_text, max_req_input_len
+        res = self.process_mm_data(
+            input_text=base_output.input_text,
+            images=base_output.images,
+            max_req_input_len=max_req_input_len,
+            conversations=base_output.input_text,
         )
         images_seq_mask = res["images_seq_mask"]
         images_spatial_crop = res["images_spatial_crop"]
@@ -107,13 +62,17 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
             batched_images_spatial_crop.append(images_spatial_crop)
         batched_images_spatial_crop = torch.stack(batched_images_spatial_crop, dim=0)
 
+        items = []
+        item = MultimodalDataItem(
+            pixel_values=res["images"],
+            modality=Modality.IMAGE,
+            image_emb_mask=images_seq_mask,
+            image_spatial_crop=batched_images_spatial_crop,
+        )
+        items += [item]
+
         return {
+            "mm_items": items,
             "input_ids": res["input_ids"].tolist(),
-            "pixel_values": res["images"],
-            "im_token_id": res["im_token_id"],
-            "data_hashes": base_output.mm_data_hashes,
-            "image_sizes": image_sizes,
-            "images_emb_mask": images_seq_mask,
-            "image_spatial_crop": batched_images_spatial_crop,
-            "modalities": request_obj.modalities or ["image"],
+            "im_token_id": self._processor.image_token_id,
         }

sglang/srt/managers/multimodal_processors/gemma3.py

@@ -7,8 +7,8 @@ from sglang.srt.managers.multimodal_processor import (
 )
 from sglang.srt.managers.multimodal_processors.base_processor import (
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.gemma3_mm import Gemma3ForConditionalGeneration
 
 # Copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma3/image_processing_gemma3_fast.py
@@ -25,28 +25,6 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         self.IM_START_TOKEN_ID = hf_config.boi_token_index
         self.IM_END_TOKEN_ID = hf_config.eoi_token_index
 
-    async def _process_single_image(self, images, input_text) -> dict:
-        if isinstance(images, list) and len(images) == 0:
-            images = None
-        processor = get_global_processor()
-        result = processor.__call__(
-            text=[input_text],
-            images=images,
-            padding=True,
-            return_tensors="pt",
-            # if RGBA, this needs to be set
-            # images_kwargs={
-            #     "input_data_format": ChannelDimension.FIRST
-            # }
-        )
-
-        pixel_values = getattr(result, "pixel_values", None)
-
-        return {
-            "input_ids": result.input_ids,
-            "pixel_values": pixel_values,
-        }
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
@@ -63,21 +41,28 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
 
         image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
-            input_ids=input_ids,
+            prompt=input_ids,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
             max_req_input_len=max_req_input_len,
             discard_alpha_channel=True,
         )
 
-        ret = await self._process_single_image(
+        ret = self.process_mm_data(
             input_text=base_output.input_text, images=base_output.images
         )
 
+        items = []
+        for i, image in enumerate(base_output.images):
+            item = MultimodalDataItem(
+                pixel_values=ret["pixel_values"][i],
+                modality=Modality.IMAGE,
+            )
+            items += [item]
+
         return {
+            "mm_items": items,
             "input_ids": ret["input_ids"].flatten().tolist(),
-            "pixel_values": ret["pixel_values"],
-            "data_hashes": base_output.mm_data_hashes,
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
         }

sglang/srt/managers/multimodal_processors/janus_pro.py

@@ -1,11 +1,10 @@
-import asyncio
 from typing import List, Union
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
 
 
@@ -15,37 +14,6 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
 
-    @staticmethod
-    def _process_images_task(images, input_text):
-        processor = get_global_processor()
-        result = processor.__call__(
-            prompt=input_text, images=images, return_tensors="pt"
-        )
-        return {
-            "input_ids": result["input_ids"],
-            "pixel_values": result["pixel_values"],
-            "images_emb_mask": result["images_emb_mask"],
-            "im_start_id": processor.image_start_id,
-            "im_end_id": processor.image_end_id,
-            "im_token_id": processor.image_id,
-        }
-
-    async def _process_images(self, images, input_text):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            image_inputs = await loop.run_in_executor(
-                self.executor,
-                JanusProImageProcessor._process_images_task,
-                images,
-                input_text,
-            )
-        else:
-            image_inputs = self._processor(
-                images=images, text=input_text, return_tensors="pt"
-            )
-
-        return image_inputs
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
@@ -60,25 +28,31 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
         if not isinstance(image_data, list):
             image_data = [image_data]
 
+        processor = self._processor
+
         base_out = self.load_mm_data(
-            input_ids=input_ids,
+            prompt=input_ids,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
-                image_token="<image_placeholder>"
-            ),
+            multimodal_tokens=MultimodalSpecialTokens(image_token=processor.image_tag),
             max_req_input_len=max_req_input_len,
         )
+
         images = base_out.images
-        res = await self._process_images(images=images, input_text=base_out.input_text)
-        # print(res)
-        # print(base_out)
-        # print("", res["images_emb_mask"].shape)
+        res = self.process_mm_data(
+            input_text=base_out.input_text,
+            prompt=base_out.input_text,
+            images=images,
+        )
         return {
+            "mm_items": [
+                MultimodalDataItem(
+                    pixel_values=res["pixel_values"],
+                    image_emb_mask=res["images_emb_mask"],
+                    modality=Modality.IMAGE,
+                )
+            ],
             "input_ids": res["input_ids"].flatten().tolist(),
-            "pixel_values": res["pixel_values"],
-            "images_emb_mask": res["images_emb_mask"],
-            "data_hashes": base_out.mm_data_hashes,
-            "im_start_id": res["im_start_id"],
-            "im_end_id": res["im_end_id"],
-            "im_token_id": res["im_token_id"],
+            "im_start_id": processor.image_start_id,
+            "im_end_id": processor.image_end_id,
+            "im_token_id": processor.image_id,
         }

sglang/srt/managers/multimodal_processors/llava.py

@@ -5,17 +5,26 @@ import numpy as np
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.mm_utils import expand2square, process_anyres_image
-from sglang.srt.models.llava import LlavaMistralForCausalLM, LlavaQwenForCausalLM
+from sglang.srt.models.llava import (
+    LlavaLlamaForCausalLM,
+    LlavaMistralForCausalLM,
+    LlavaQwenForCausalLM,
+)
 from sglang.srt.models.llavavid import LlavaVidForCausalLM
 from sglang.srt.utils import load_image, logger
 from sglang.utils import get_exception_traceback
 
 
 class LlavaImageProcessor(BaseMultimodalProcessor):
-    models = [LlavaVidForCausalLM, LlavaQwenForCausalLM, LlavaMistralForCausalLM]
+    models = [
+        LlavaLlamaForCausalLM,
+        LlavaVidForCausalLM,
+        LlavaQwenForCausalLM,
+        LlavaMistralForCausalLM,
+    ]
 
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
@@ -25,11 +34,10 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
         image_data: Union[str, bytes],
         image_aspect_ratio: Optional[str] = None,
         image_grid_pinpoints: Optional[str] = None,
-        image_processor=None,
+        processor=None,
     ):
-        processor = get_global_processor()
 
-        image_processor = image_processor or processor.image_processor
+        image_processor = processor.image_processor
 
         try:
             image, image_size = load_image(image_data)
@@ -72,18 +80,22 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
     async def _process_single_image(
         self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str
     ):
-        if self.executor is not None:
+        if self.cpu_executor is not None:
             loop = asyncio.get_event_loop()
             return await loop.run_in_executor(
-                self.executor,
+                self.cpu_executor,
                 LlavaImageProcessor._process_single_image_task,
                 image_data,
                 aspect_ratio,
                 grid_pinpoints,
+                self._processor,
             )
         else:
             return self._process_single_image_task(
-                image_data, aspect_ratio, grid_pinpoints
+                image_data,
+                aspect_ratio,
+                grid_pinpoints,
+                self._processor.image_processor,
             )
 
     async def process_mm_data_async(
@@ -134,14 +146,22 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
             pixel_values, image_hash, image_size = await self._process_single_image(
                 image_data[0], aspect_ratio, grid_pinpoints
             )
-            data_hashes = [image_hash]
             image_sizes = [image_size]
         else:
             raise ValueError(f"Invalid image data: {image_data}")
+        modality = Modality.IMAGE
+        if isinstance(request_obj.modalities, list):
+            if request_obj.modalities[0] == "multi-images":
+                modality = Modality.MULTI_IMAGES
+            elif request_obj.modalities[0] == "video":
+                modality = Modality.VIDEO
 
         return {
-            "pixel_values": pixel_values,
-            "data_hashes": data_hashes,
-            "image_sizes": image_sizes,
-            "modalities": request_obj.modalities or ["image"],
+            "mm_items": [
+                MultimodalDataItem(
+                    pixel_values=pixel_values,
+                    image_sizes=image_sizes,
+                    modality=modality,
+                )
+            ],
         }