sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. sglang/bench_serving.py +1 -1
  2. sglang/lang/chat_template.py +29 -0
  3. sglang/srt/_custom_ops.py +19 -17
  4. sglang/srt/configs/__init__.py +2 -0
  5. sglang/srt/configs/janus_pro.py +629 -0
  6. sglang/srt/configs/model_config.py +24 -14
  7. sglang/srt/conversation.py +80 -2
  8. sglang/srt/custom_op.py +64 -3
  9. sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
  10. sglang/srt/distributed/parallel_state.py +10 -1
  11. sglang/srt/entrypoints/engine.py +5 -3
  12. sglang/srt/entrypoints/http_server.py +1 -1
  13. sglang/srt/function_call_parser.py +33 -2
  14. sglang/srt/hf_transformers_utils.py +16 -1
  15. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  16. sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
  17. sglang/srt/layers/attention/triton_backend.py +1 -3
  18. sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
  19. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
  20. sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  21. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
  22. sglang/srt/layers/attention/vision.py +43 -62
  23. sglang/srt/layers/dp_attention.py +30 -2
  24. sglang/srt/layers/elementwise.py +411 -0
  25. sglang/srt/layers/linear.py +1 -1
  26. sglang/srt/layers/logits_processor.py +1 -0
  27. sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  28. sglang/srt/layers/moe/ep_moe/layer.py +25 -9
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
  37. sglang/srt/layers/moe/router.py +342 -0
  38. sglang/srt/layers/parameter.py +10 -0
  39. sglang/srt/layers/quantization/__init__.py +90 -68
  40. sglang/srt/layers/quantization/blockwise_int8.py +1 -2
  41. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  46. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  49. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  51. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/fp8.py +174 -106
  68. sglang/srt/layers/quantization/fp8_kernel.py +210 -38
  69. sglang/srt/layers/quantization/fp8_utils.py +156 -15
  70. sglang/srt/layers/quantization/modelopt_quant.py +5 -1
  71. sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
  72. sglang/srt/layers/quantization/w8a8_int8.py +152 -3
  73. sglang/srt/layers/rotary_embedding.py +5 -3
  74. sglang/srt/layers/sampler.py +29 -35
  75. sglang/srt/layers/vocab_parallel_embedding.py +0 -1
  76. sglang/srt/lora/backend/__init__.py +9 -12
  77. sglang/srt/managers/cache_controller.py +74 -8
  78. sglang/srt/managers/data_parallel_controller.py +1 -1
  79. sglang/srt/managers/image_processor.py +37 -631
  80. sglang/srt/managers/image_processors/base_image_processor.py +219 -0
  81. sglang/srt/managers/image_processors/janus_pro.py +79 -0
  82. sglang/srt/managers/image_processors/llava.py +152 -0
  83. sglang/srt/managers/image_processors/minicpmv.py +86 -0
  84. sglang/srt/managers/image_processors/mlama.py +60 -0
  85. sglang/srt/managers/image_processors/qwen_vl.py +161 -0
  86. sglang/srt/managers/io_struct.py +32 -15
  87. sglang/srt/managers/multi_modality_padding.py +134 -0
  88. sglang/srt/managers/schedule_batch.py +213 -118
  89. sglang/srt/managers/schedule_policy.py +40 -8
  90. sglang/srt/managers/scheduler.py +176 -683
  91. sglang/srt/managers/scheduler_output_processor_mixin.py +614 -0
  92. sglang/srt/managers/tokenizer_manager.py +6 -6
  93. sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
  94. sglang/srt/mem_cache/base_prefix_cache.py +6 -8
  95. sglang/srt/mem_cache/chunk_cache.py +12 -44
  96. sglang/srt/mem_cache/hiradix_cache.py +71 -34
  97. sglang/srt/mem_cache/memory_pool.py +81 -17
  98. sglang/srt/mem_cache/paged_allocator.py +283 -0
  99. sglang/srt/mem_cache/radix_cache.py +117 -36
  100. sglang/srt/model_executor/cuda_graph_runner.py +68 -20
  101. sglang/srt/model_executor/forward_batch_info.py +23 -10
  102. sglang/srt/model_executor/model_runner.py +63 -63
  103. sglang/srt/model_loader/loader.py +2 -1
  104. sglang/srt/model_loader/weight_utils.py +1 -1
  105. sglang/srt/models/deepseek_janus_pro.py +2127 -0
  106. sglang/srt/models/deepseek_nextn.py +23 -3
  107. sglang/srt/models/deepseek_v2.py +200 -191
  108. sglang/srt/models/grok.py +374 -119
  109. sglang/srt/models/minicpmv.py +28 -89
  110. sglang/srt/models/mllama.py +1 -1
  111. sglang/srt/models/qwen2.py +0 -1
  112. sglang/srt/models/qwen2_5_vl.py +25 -50
  113. sglang/srt/models/qwen2_vl.py +33 -49
  114. sglang/srt/openai_api/adapter.py +59 -35
  115. sglang/srt/openai_api/protocol.py +8 -1
  116. sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
  117. sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
  118. sglang/srt/server_args.py +24 -16
  119. sglang/srt/speculative/eagle_worker.py +75 -39
  120. sglang/srt/utils.py +104 -9
  121. sglang/test/runners.py +104 -10
  122. sglang/test/test_block_fp8.py +106 -16
  123. sglang/test/test_custom_ops.py +88 -0
  124. sglang/test/test_utils.py +20 -4
  125. sglang/utils.py +0 -4
  126. sglang/version.py +1 -1
  127. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/METADATA +9 -10
  128. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/RECORD +131 -84
  129. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/WHEEL +1 -1
  130. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/LICENSE +0 -0
  131. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/managers/image_processors/base_image_processor.py
@@ -0,0 +1,219 @@
+ import concurrent
+ import concurrent.futures
+ import dataclasses
+ import multiprocessing as mp
+ import os
+ from abc import ABC, abstractmethod
+ from typing import Optional
+
+ import PIL
+ import transformers
+ from decord import VideoReader, cpu
+ from PIL import Image
+
+ from sglang.srt.server_args import ServerArgs
+ from sglang.srt.utils import load_image
+ from sglang.utils import logger
+
+ global global_processor
+
+
+ def get_global_processor():
+     global global_processor
+     return global_processor
+
+
+ def init_global_processor(sglang_image_processor, server_args: ServerArgs):
+     """Init the global processor for multi-modal models."""
+     global global_processor
+     transformers.logging.set_verbosity_error()
+     global_processor = sglang_image_processor._build_processor(server_args=server_args)
+
+
+ @dataclasses.dataclass
+ class BaseImageProcessorOutput:
+     image_hashes: list[int]
+     image_sizes: list[tuple[int, int]]
+     all_frames: [PIL.Image]
+     # input_text, with each frame of video/image represented as an image_token
+     input_text: str
+
+
+ class BaseImageProcessor(ABC):
+     def __init__(self, hf_config, server_args, _processor):
+         self.hf_config = hf_config
+         self._processor = _processor
+         self.server_args = server_args
+         # FIXME: not accurate, model and image specific
+         self.NUM_TOKEN_PER_FRAME = 330
+
+         self.executor = concurrent.futures.ProcessPoolExecutor(
+             initializer=init_global_processor,
+             mp_context=mp.get_context("fork"),
+             initargs=(
+                 self,
+                 server_args,
+             ),
+             max_workers=int(os.environ.get("SGLANG_CPU_COUNT", os.cpu_count())),
+         )
+
+     def _build_processor(self, server_args):
+         """Init the global processor for multi modal models."""
+         from sglang.srt.hf_transformers_utils import get_processor
+
+         return get_processor(
+             server_args.tokenizer_path,
+             tokenizer_mode=server_args.tokenizer_mode,
+             trust_remote_code=server_args.trust_remote_code,
+         )
+
+     @abstractmethod
+     async def process_images_async(
+         self, image_data, input_text, max_req_input_len, **kwargs
+     ):
+         pass
+
+     def get_estimated_frames_list(self, image_data):
+         """
+         estimate the total frame count from all visual input
+         """
+         # Before processing inputs
+         estimated_frames_list = []
+         for image in image_data:
+             if isinstance(image, str) and image.startswith("video:"):
+                 path = image[len("video:") :]
+                 # Estimate frames for the video
+                 vr = VideoReader(path, ctx=cpu(0))
+                 num_frames = len(vr)
+             else:
+                 # For images, each contributes one frame
+                 num_frames = 1
+             estimated_frames_list.append(num_frames)
+
+         return estimated_frames_list
+
+     @staticmethod
+     def encode_video(video_path, frame_count_limit=None):
+         if not os.path.exists(video_path):
+             logger.error(f"Video {video_path} does not exist")
+             return []
+
+         if frame_count_limit == 0:
+             return []
+
+         def uniform_sample(l, n):
+             gap = len(l) / n
+             idxs = [int(i * gap + gap / 2) for i in range(n)]
+             return [l[i] for i in idxs]
+
+         vr = VideoReader(video_path, ctx=cpu(0))
+         sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+         frame_indices = [i for i in range(0, len(vr), sample_fps)]
+         if frame_count_limit is not None and len(frame_indices) > frame_count_limit:
+             frame_indices = uniform_sample(frame_indices, frame_count_limit)
+
+         frames = vr.get_batch(frame_indices).asnumpy()
+         frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+         return frames
+
+     def load_images(
+         self,
+         input_ids: list,
+         image_data,
+         image_token: str,
+         max_req_input_len: int,
+         return_text: Optional[bool] = True,
+         discard_alpha_channel: bool = True,
+     ) -> BaseImageProcessorOutput:
+         """
+         Each frame of video/image will be replaced by a single image token
+
+         Args:
+
+             discard_alpha_channel: if True, discards the alpha channel in the returned images
+
+         """
+         image_hashes, image_sizes = [], []
+         all_frames = []
+         new_text_parts = []
+
+         if isinstance(input_ids, list) and return_text:
+             assert len(input_ids) and isinstance(input_ids[0], int)
+             input_text = self._processor.tokenizer.decode(input_ids)
+         else:
+             input_text = input_ids
+
+         if return_text:
+             text_parts = input_text.split(image_token)
+
+         # TODO(mick): load from server_args, env, or sampling_params
+         MAX_NUM_FRAMES = 30
+         estimated_frames_list = self.get_estimated_frames_list(image_data=image_data)
+         total_frame_count = sum(estimated_frames_list)
+         # a heuristic value, suggesting the maximum fraction of frames to embed from all visual inputs.
+         # e.g., 0.1 suggests that 1 frame out of 10 input frames should be used
+         scaling_factor = min(1.0, MAX_NUM_FRAMES / total_frame_count)
+
+         assert len(image_data) == len(estimated_frames_list)
+
+         # Process each input with allocated frames
+         for image_index, (image, estimated_frames) in enumerate(
+             zip(image_data, estimated_frames_list)
+         ):
+             if len(all_frames) >= MAX_NUM_FRAMES:
+                 max_frames_to_process = 0
+             else:
+                 max_frames_to_process = max(1, int(estimated_frames * scaling_factor))
+
+             if max_frames_to_process == 0:
+                 frames = []
+             else:
+                 try:
+                     if isinstance(image, str) and image.startswith("video:"):
+                         path = image[len("video:") :]
+                         frames = BaseImageProcessor.encode_video(
+                             path, frame_count_limit=max_frames_to_process
+                         )
+                     else:
+                         raw_image, _size = load_image(image)
+                         if discard_alpha_channel:
+                             raw_image = raw_image.convert("RGB")
+                         frames = [raw_image]
+                     assert len(frames) != 0
+                 except FileNotFoundError as e:
+                     print(e)
+                     return None
+
+             image_sizes += [frames[0].size] * len(frames)
+             image_hashes += [hash(image)] * len(frames)
+             all_frames += frames
+
+             if return_text:
+                 new_text_parts.append(text_parts[image_index])
+                 if max_frames_to_process != 0:
+                     new_text_parts.append(image_token * len(frames))
+                 assert max_frames_to_process >= len(frames)
+         if return_text:
+             new_text_parts.append(text_parts[-1])
+
+         input_text = "".join(new_text_parts)
+         return BaseImageProcessorOutput(
+             image_hashes, image_sizes, all_frames, input_text
+         )
+
+
+ class DummyImageProcessor(BaseImageProcessor):
+     def __init__(self):
+         pass
+
+     async def process_images_async(self, *args, **kwargs):
+         return None
+
+
+ def init_global_processor(
+     sglang_image_processor: BaseImageProcessor, server_args: ServerArgs
+ ):
+     """Init the global processor for multi-modal models."""
+     global global_processor
+     transformers.logging.set_verbosity_error()
+     global_processor = sglang_image_processor._build_processor(server_args=server_args)
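The frame-budget heuristic in load_images scales each input's frame allowance so the combined total stays near MAX_NUM_FRAMES. A small worked illustration of that arithmetic (not part of the diff; standalone Python for clarity):

# Hedged illustration of the frame-budget heuristic used by load_images above.
# With MAX_NUM_FRAMES = 30 and three inputs estimated at [1, 50, 9] frames,
# scaling_factor = min(1.0, 30 / 60) = 0.5, and each input is allotted
# max(1, int(estimated * 0.5)) frames.
estimated_frames_list = [1, 50, 9]
MAX_NUM_FRAMES = 30
scaling_factor = min(1.0, MAX_NUM_FRAMES / sum(estimated_frames_list))
print([max(1, int(n * scaling_factor)) for n in estimated_frames_list])  # [1, 25, 4]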
sglang/srt/managers/image_processors/janus_pro.py
@@ -0,0 +1,79 @@
+ import asyncio
+ from typing import List, Union
+
+ from sglang.srt.managers.image_processors.base_image_processor import (
+     BaseImageProcessor as SGLangBaseImageProcessor,
+ )
+ from sglang.srt.managers.image_processors.base_image_processor import (
+     get_global_processor,
+ )
+ from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
+
+
+ class JanusProProcessor(SGLangBaseImageProcessor):
+     def __init__(self, hf_config, server_args, _processor):
+         super().__init__(hf_config, server_args, _processor)
+
+     @staticmethod
+     def _process_images_task(images, input_text):
+         processor = get_global_processor()
+         result = processor.__call__(
+             prompt=input_text, images=images, return_tensors="pt"
+         )
+         return {
+             "input_ids": result["input_ids"],
+             "pixel_values": result["pixel_values"],
+             "images_emb_mask": result["images_emb_mask"],
+             "im_start_id": processor.image_start_id,
+             "im_end_id": processor.image_end_id,
+             "im_token_id": processor.image_id,
+         }
+
+     async def _process_images(self, images, input_text):
+         if self.executor is not None:
+             loop = asyncio.get_event_loop()
+             image_inputs = await loop.run_in_executor(
+                 self.executor,
+                 JanusProProcessor._process_images_task,
+                 images,
+                 input_text,
+             )
+         else:
+             image_inputs = self._processor(
+                 images=images, text=input_text, return_tensors="pt"
+             )
+
+         return image_inputs
+
+     async def process_images_async(
+         self,
+         image_data: List[Union[str, bytes]],
+         input_ids,
+         request_obj,
+         max_req_input_len,
+         **kwargs,
+     ):
+         if not image_data:
+             return None
+
+         if not isinstance(image_data, list):
+             image_data = [image_data]
+
+         base_out = self.load_images(
+             input_ids, image_data, "<image_placeholder>", max_req_input_len
+         )
+         images = base_out.all_frames
+         res = await self._process_images(images=images, input_text=base_out.input_text)
+
+         return {
+             "input_ids": res["input_ids"].flatten().tolist(),
+             "pixel_values": res["pixel_values"],
+             "images_emb_mask": res["images_emb_mask"],
+             "image_hashes": base_out.image_hashes,
+             "im_start_id": res["im_start_id"],
+             "im_end_id": res["im_end_id"],
+             "im_token_id": res["im_token_id"],
+         }
+
+
+ ImageProcessorMapping = {MultiModalityCausalLM: JanusProProcessor}
sglang/srt/managers/image_processors/llava.py
@@ -0,0 +1,152 @@
+ import asyncio
+ from typing import List, Optional, Union
+
+ import numpy as np
+
+ from sglang.srt.managers.image_processor import BaseImageProcessor
+ from sglang.srt.managers.image_processors.base_image_processor import (
+     get_global_processor,
+ )
+ from sglang.srt.mm_utils import expand2square, process_anyres_image
+ from sglang.srt.models.llava import LlavaMistralForCausalLM, LlavaQwenForCausalLM
+ from sglang.srt.models.llavavid import LlavaVidForCausalLM
+ from sglang.srt.utils import load_image, logger
+ from sglang.utils import get_exception_traceback
+
+
+ class LlavaImageProcessor(BaseImageProcessor):
+     def __init__(self, hf_config, server_args, _processor):
+         super().__init__(hf_config, server_args, _processor)
+
+     @staticmethod
+     def _process_single_image_task(
+         image_data: Union[str, bytes],
+         image_aspect_ratio: Optional[str] = None,
+         image_grid_pinpoints: Optional[str] = None,
+         image_processor=None,
+     ):
+         processor = get_global_processor()
+
+         image_processor = image_processor or processor.image_processor
+
+         try:
+             image, image_size = load_image(image_data)
+             if image_size is not None:
+                 # It is a video with multiple images
+                 image_hash = hash(image_data)
+                 pixel_values = image_processor(image)["pixel_values"]
+                 for _ in range(len(pixel_values)):
+                     pixel_values[_] = pixel_values[_].astype(np.float16)
+                 pixel_values = np.stack(pixel_values, axis=0)
+                 return pixel_values, image_hash, image_size
+             else:
+                 # It is an image
+                 image_hash = hash(image_data)
+                 if image_aspect_ratio == "pad":
+                     image = expand2square(
+                         image,
+                         tuple(int(x * 255) for x in image_processor.image_mean),
+                     )
+                     pixel_values = image_processor(image.convert("RGB"))[
+                         "pixel_values"
+                     ][0]
+                 elif image_aspect_ratio == "anyres" or (
+                     image_aspect_ratio is not None
+                     and "anyres_max" in image_aspect_ratio
+                 ):
+                     pixel_values = process_anyres_image(
+                         image, image_processor, image_grid_pinpoints
+                     )
+                 else:
+                     pixel_values = image_processor(image)["pixel_values"][0]
+
+                 if isinstance(pixel_values, np.ndarray):
+                     pixel_values = pixel_values.astype(np.float16)
+
+                 return pixel_values, image_hash, image.size
+         except Exception:
+             logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
+
+     async def _process_single_image(
+         self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str
+     ):
+         if self.executor is not None:
+             loop = asyncio.get_event_loop()
+             return await loop.run_in_executor(
+                 self.executor,
+                 LlavaImageProcessor._process_single_image_task,
+                 image_data,
+                 aspect_ratio,
+                 grid_pinpoints,
+             )
+         else:
+             return self._process_single_image_task(
+                 image_data, aspect_ratio, grid_pinpoints
+             )
+
+     async def process_images_async(
+         self,
+         image_data: List[Union[str, bytes]],
+         input_text,
+         request_obj,
+         *args,
+         **kwargs,
+     ):
+         if not image_data:
+             return None
+
+         modalities = request_obj.modalities or ["image"]
+         aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
+         grid_pinpoints = (
+             self.hf_config.image_grid_pinpoints
+             if hasattr(self.hf_config, "image_grid_pinpoints")
+             and "anyres" in aspect_ratio
+             else None
+         )
+
+         if isinstance(image_data, str):
+             image_data = [image_data]
+
+         if isinstance(image_data, list) and len(image_data) > 0:
+             if "multi-images" in modalities or "video" in modalities:
+                 # Multiple images
+                 aspect_ratio = "pad"  # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
+                 pixel_values, image_hashes, image_sizes = [], [], []
+                 res = []
+                 for img_data in image_data:
+                     res.append(
+                         self._process_single_image(
+                             img_data, aspect_ratio, grid_pinpoints
+                         )
+                     )
+                 res = await asyncio.gather(*res)
+                 for pixel_v, image_h, image_s in res:
+                     pixel_values.append(pixel_v)
+                     image_hashes.append(image_h)
+                     image_sizes.append(image_s)
+
+                 if isinstance(pixel_values[0], np.ndarray):
+                     pixel_values = np.stack(pixel_values, axis=0)
+             else:
+                 # A single image
+                 pixel_values, image_hash, image_size = await self._process_single_image(
+                     image_data[0], aspect_ratio, grid_pinpoints
+                 )
+                 image_hashes = [image_hash]
+                 image_sizes = [image_size]
+         else:
+             raise ValueError(f"Invalid image data: {image_data}")
+
+         return {
+             "pixel_values": pixel_values,
+             "image_hashes": image_hashes,
+             "image_sizes": image_sizes,
+             "modalities": request_obj.modalities or ["image"],
+         }
+
+
+ ImageProcessorMapping = {
+     LlavaVidForCausalLM: LlavaImageProcessor,
+     LlavaQwenForCausalLM: LlavaImageProcessor,
+     LlavaMistralForCausalLM: LlavaImageProcessor,
+ }
sglang/srt/managers/image_processors/minicpmv.py
@@ -0,0 +1,86 @@
+ import asyncio
+ from typing import List, Union
+
+ from sglang.srt.managers.image_processor import BaseImageProcessor
+ from sglang.srt.managers.image_processors.base_image_processor import (
+     get_global_processor,
+ )
+ from sglang.srt.models.minicpmv import MiniCPMV
+
+
+ class MiniCPMVImageProcessor(BaseImageProcessor):
+     def __init__(self, hf_config, server_args, _processor):
+         super().__init__(hf_config, server_args, _processor)
+         self.IMAGE_TOKEN = "(<image>./</image>)"
+
+     @staticmethod
+     def _process_images_task(images, input_text):
+         processor = get_global_processor()
+         result = processor.__call__(text=input_text, images=images, return_tensors="pt")
+         return {
+             "input_ids": result.input_ids,
+             "pixel_values": result.pixel_values,
+             "tgt_sizes": result.tgt_sizes,
+         }
+
+     async def _process_images(self, images, input_text):
+         if self.executor is not None:
+             loop = asyncio.get_event_loop()
+             image_inputs = await loop.run_in_executor(
+                 self.executor,
+                 MiniCPMVImageProcessor._process_images_task,
+                 images,
+                 input_text,
+             )
+         else:
+             image_inputs = self._processor(
+                 images=images, text=input_text, return_tensors="pt"
+             )
+
+         return image_inputs
+
+     async def process_images_async(
+         self,
+         image_data: List[Union[str, bytes]],
+         input_ids,
+         request_obj,
+         max_req_input_len,
+     ):
+         if not image_data:
+             return None
+         if not isinstance(image_data, list):
+             image_data = [image_data]
+
+         base_output = self.load_images(
+             input_ids, image_data, self.IMAGE_TOKEN, max_req_input_len
+         )
+         if base_output is None:
+             return None
+
+         if len(base_output.all_frames) == 0:
+             return None
+         res = await self._process_images(
+             images=base_output.all_frames, input_text=base_output.input_text
+         )
+
+         # Collect special token ids
+         tokenizer = self._processor.tokenizer
+         im_start_id = tokenizer.im_start_id
+         im_end_id = tokenizer.im_end_id
+         if tokenizer.slice_start_id:
+             slice_start_id = tokenizer.slice_start_id
+             slice_end_id = tokenizer.slice_end_id
+         return {
+             "input_ids": res["input_ids"].flatten().tolist(),
+             "pixel_values": res["pixel_values"],
+             "tgt_sizes": res["tgt_sizes"],
+             "image_hashes": base_output.image_hashes,
+             "modalities": request_obj.modalities or ["image"],
+             "im_start_id": im_start_id,
+             "im_end_id": im_end_id,
+             "slice_start_id": slice_start_id,
+             "slice_end_id": slice_end_id,
+         }
+
+
+ ImageProcessorMapping = {MiniCPMV: MiniCPMVImageProcessor}
sglang/srt/managers/image_processors/mlama.py
@@ -0,0 +1,60 @@
+ import asyncio
+ from typing import List, Union
+
+ from sglang.srt.managers.image_processor import BaseImageProcessor
+ from sglang.srt.managers.image_processors.base_image_processor import (
+     get_global_processor,
+ )
+ from sglang.srt.models.mllama import MllamaForConditionalGeneration
+ from sglang.srt.utils import load_image
+
+
+ class MllamaImageProcessor(BaseImageProcessor):
+     def __init__(self, hf_config, server_args, _processor):
+         super().__init__(hf_config, server_args, _processor)
+
+     @staticmethod
+     def _process_single_image_task(images, input_text):
+         # input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask'
+         return get_global_processor()(images, input_text, return_tensors="pt")
+
+     async def _process_single_image(self, images, input_text):
+         if self.executor is not None:
+             loop = asyncio.get_event_loop()
+             image_inputs = await loop.run_in_executor(
+                 self.executor,
+                 MllamaImageProcessor._process_single_image_task,
+                 images,
+                 input_text,
+             )
+         else:
+             image_inputs = self._processor(images, input_text, return_tensors="pt")
+
+         return image_inputs
+
+     async def process_images_async(
+         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
+     ):
+         if not image_data:
+             return None
+
+         if isinstance(input_text, list):
+             assert len(input_text) and isinstance(input_text[0], int)
+             input_text = self._processor.tokenizer.decode(input_text)
+
+         if not isinstance(image_data, list):
+             image_data = [image_data]
+
+         if len(image_data) > 0:
+             images = [load_image(image)[0] for image in image_data]
+         else:
+             images = load_image(image_data[0])[0]
+
+         image_inputs = await self._process_single_image(images, input_text)
+         image_inputs["image_hashes"] = [hash(str(image_data))]
+         image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
+
+         return image_inputs
+
+
+ ImageProcessorMapping = {MllamaForConditionalGeneration: MllamaImageProcessor}
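Each of the new per-model modules above ends by exporting an ImageProcessorMapping dict keyed by the model class it serves. A minimal sketch of how a caller could dispatch on one of these registries (the build_image_processor helper below is hypothetical, for illustration only, and is not part of this release):

# Hedged sketch, not code from sglang 0.4.4.post1: dispatching on the
# ImageProcessorMapping exported by one of the new image_processors modules.
from sglang.srt.managers.image_processors.minicpmv import ImageProcessorMapping


def build_image_processor(model_cls, hf_config, server_args, hf_processor):
    """Return the image processor registered for model_cls, or None."""
    processor_cls = ImageProcessorMapping.get(model_cls)
    if processor_cls is None:
        return None
    # Each processor in the diff takes (hf_config, server_args, _processor).
    return processor_cls(hf_config, server_args, hf_processor)

# Usage, assuming hf_config, server_args, and hf_processor already exist:
# from sglang.srt.models.minicpmv import MiniCPMV
# image_processor = build_image_processor(MiniCPMV, hf_config, server_args, hf_processor)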