sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (131)
  1. sglang/bench_serving.py +1 -1
  2. sglang/lang/chat_template.py +29 -0
  3. sglang/srt/_custom_ops.py +19 -17
  4. sglang/srt/configs/__init__.py +2 -0
  5. sglang/srt/configs/janus_pro.py +629 -0
  6. sglang/srt/configs/model_config.py +24 -14
  7. sglang/srt/conversation.py +80 -2
  8. sglang/srt/custom_op.py +64 -3
  9. sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
  10. sglang/srt/distributed/parallel_state.py +10 -1
  11. sglang/srt/entrypoints/engine.py +5 -3
  12. sglang/srt/entrypoints/http_server.py +1 -1
  13. sglang/srt/function_call_parser.py +33 -2
  14. sglang/srt/hf_transformers_utils.py +16 -1
  15. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  16. sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
  17. sglang/srt/layers/attention/triton_backend.py +1 -3
  18. sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
  19. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
  20. sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  21. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
  22. sglang/srt/layers/attention/vision.py +43 -62
  23. sglang/srt/layers/dp_attention.py +30 -2
  24. sglang/srt/layers/elementwise.py +411 -0
  25. sglang/srt/layers/linear.py +1 -1
  26. sglang/srt/layers/logits_processor.py +1 -0
  27. sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  28. sglang/srt/layers/moe/ep_moe/layer.py +25 -9
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
  37. sglang/srt/layers/moe/router.py +342 -0
  38. sglang/srt/layers/parameter.py +10 -0
  39. sglang/srt/layers/quantization/__init__.py +90 -68
  40. sglang/srt/layers/quantization/blockwise_int8.py +1 -2
  41. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  46. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  49. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  51. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/fp8.py +174 -106
  68. sglang/srt/layers/quantization/fp8_kernel.py +210 -38
  69. sglang/srt/layers/quantization/fp8_utils.py +156 -15
  70. sglang/srt/layers/quantization/modelopt_quant.py +5 -1
  71. sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
  72. sglang/srt/layers/quantization/w8a8_int8.py +152 -3
  73. sglang/srt/layers/rotary_embedding.py +5 -3
  74. sglang/srt/layers/sampler.py +29 -35
  75. sglang/srt/layers/vocab_parallel_embedding.py +0 -1
  76. sglang/srt/lora/backend/__init__.py +9 -12
  77. sglang/srt/managers/cache_controller.py +74 -8
  78. sglang/srt/managers/data_parallel_controller.py +1 -1
  79. sglang/srt/managers/image_processor.py +37 -631
  80. sglang/srt/managers/image_processors/base_image_processor.py +219 -0
  81. sglang/srt/managers/image_processors/janus_pro.py +79 -0
  82. sglang/srt/managers/image_processors/llava.py +152 -0
  83. sglang/srt/managers/image_processors/minicpmv.py +86 -0
  84. sglang/srt/managers/image_processors/mlama.py +60 -0
  85. sglang/srt/managers/image_processors/qwen_vl.py +161 -0
  86. sglang/srt/managers/io_struct.py +32 -15
  87. sglang/srt/managers/multi_modality_padding.py +134 -0
  88. sglang/srt/managers/schedule_batch.py +213 -118
  89. sglang/srt/managers/schedule_policy.py +40 -8
  90. sglang/srt/managers/scheduler.py +176 -683
  91. sglang/srt/managers/scheduler_output_processor_mixin.py +614 -0
  92. sglang/srt/managers/tokenizer_manager.py +6 -6
  93. sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
  94. sglang/srt/mem_cache/base_prefix_cache.py +6 -8
  95. sglang/srt/mem_cache/chunk_cache.py +12 -44
  96. sglang/srt/mem_cache/hiradix_cache.py +71 -34
  97. sglang/srt/mem_cache/memory_pool.py +81 -17
  98. sglang/srt/mem_cache/paged_allocator.py +283 -0
  99. sglang/srt/mem_cache/radix_cache.py +117 -36
  100. sglang/srt/model_executor/cuda_graph_runner.py +68 -20
  101. sglang/srt/model_executor/forward_batch_info.py +23 -10
  102. sglang/srt/model_executor/model_runner.py +63 -63
  103. sglang/srt/model_loader/loader.py +2 -1
  104. sglang/srt/model_loader/weight_utils.py +1 -1
  105. sglang/srt/models/deepseek_janus_pro.py +2127 -0
  106. sglang/srt/models/deepseek_nextn.py +23 -3
  107. sglang/srt/models/deepseek_v2.py +200 -191
  108. sglang/srt/models/grok.py +374 -119
  109. sglang/srt/models/minicpmv.py +28 -89
  110. sglang/srt/models/mllama.py +1 -1
  111. sglang/srt/models/qwen2.py +0 -1
  112. sglang/srt/models/qwen2_5_vl.py +25 -50
  113. sglang/srt/models/qwen2_vl.py +33 -49
  114. sglang/srt/openai_api/adapter.py +59 -35
  115. sglang/srt/openai_api/protocol.py +8 -1
  116. sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
  117. sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
  118. sglang/srt/server_args.py +24 -16
  119. sglang/srt/speculative/eagle_worker.py +75 -39
  120. sglang/srt/utils.py +104 -9
  121. sglang/test/runners.py +104 -10
  122. sglang/test/test_block_fp8.py +106 -16
  123. sglang/test/test_custom_ops.py +88 -0
  124. sglang/test/test_utils.py +20 -4
  125. sglang/utils.py +0 -4
  126. sglang/version.py +1 -1
  127. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/METADATA +9 -10
  128. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/RECORD +131 -84
  129. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/WHEEL +1 -1
  130. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/LICENSE +0 -0
  131. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/top_level.txt +0 -0
@@ -1,649 +1,55 @@
  # TODO: also move pad_input_ids into this module
- import asyncio
- import concurrent.futures
- import dataclasses
+ import importlib
  import logging
- import multiprocessing as mp
- import os
- from abc import ABC, abstractmethod
- from typing import List, Optional, Union
+ import pkgutil
+ from functools import lru_cache

- import numpy as np
- import PIL
- import transformers
- from decord import VideoReader, cpu
- from PIL import Image
+ from transformers import IMAGE_PROCESSOR_MAPPING

- from sglang.srt.hf_transformers_utils import get_processor
- from sglang.srt.mm_utils import expand2square, process_anyres_image
+ from sglang.srt.managers.image_processors.base_image_processor import (
+     BaseImageProcessor,
+     DummyImageProcessor,
+ )
  from sglang.srt.server_args import ServerArgs
- from sglang.srt.utils import load_image
- from sglang.utils import get_exception_traceback

  logger = logging.getLogger(__name__)

- global global_processor

-
- def init_global_processor(server_args: ServerArgs):
-     """Init the global processor for multi modal models."""
-     global global_processor
-     transformers.logging.set_verbosity_error()
-     global_processor = get_processor(
-         server_args.tokenizer_path,
-         tokenizer_mode=server_args.tokenizer_mode,
-         trust_remote_code=server_args.trust_remote_code,
-     )
-
-
- @dataclasses.dataclass
- class BaseImageProcessorOutput:
-     image_hashes: list[int]
-     image_sizes: list[int]
-     all_frames: [PIL.Image]
-     # input_text, with each frame of video/image represented with a image_token
-     input_text: str
-
-
- class BaseImageProcessor(ABC):
-     def __init__(self, hf_config, server_args, _processor):
-         self.hf_config = hf_config
-         self._processor = _processor
-         self.server_args = server_args
-         # FIXME: not accurate, model and image specific
-         self.NUM_TOKEN_PER_FRAME = 330
-
-         self.executor = concurrent.futures.ProcessPoolExecutor(
-             initializer=init_global_processor,
-             mp_context=mp.get_context("fork"),
-             initargs=(server_args,),
-             max_workers=int(os.environ.get("SGLANG_CPU_COUNT", os.cpu_count())),
-         )
-
-     @abstractmethod
-     async def process_images_async(
-         self, image_data, input_text, max_req_input_len, **kwargs
-     ):
-         pass
-
-     def get_estimated_frames_list(self, image_data):
-         """
-         estimate the total frame count from all visual input
-         """
-         # Before processing inputs
-         estimated_frames_list = []
-         for image in image_data:
-             if isinstance(image, str) and image.startswith("video:"):
-                 path = image[len("video:") :]
-                 # Estimate frames for the video
-                 vr = VideoReader(path, ctx=cpu(0))
-                 num_frames = len(vr)
-             else:
-                 # For images, each contributes one frame
-                 num_frames = 1
-             estimated_frames_list.append(num_frames)
-
-         return estimated_frames_list
-
-     def encode_video(self, video_path, frame_count_limit=None):
-         if not os.path.exists(video_path):
-             logger.error(f"Video {video_path} does not exist")
-             return []
-
-         if frame_count_limit == 0:
-             return []
-
-         def uniform_sample(l, n):
-             gap = len(l) / n
-             idxs = [int(i * gap + gap / 2) for i in range(n)]
-             return [l[i] for i in idxs]
-
-         vr = VideoReader(video_path, ctx=cpu(0))
-         sample_fps = round(vr.get_avg_fps() / 1)  # FPS
-         frame_idx = [i for i in range(0, len(vr), sample_fps)]
-         if frame_count_limit is not None and len(frame_idx) > frame_count_limit:
-             frame_idx = uniform_sample(frame_idx, frame_count_limit)
-         frames = vr.get_batch(frame_idx).asnumpy()
-         frames = [Image.fromarray(v.astype("uint8")) for v in frames]
-         return frames
-
-     def load_images(
-         self,
-         max_req_input_len: int,
-         input_ids: list,
-         image_data,
-         image_token: str,
-     ) -> BaseImageProcessorOutput:
-         """
-         Each frame of video/image will be replaced by a single image token
-         """
-         image_hashes, image_sizes = [], []
-         all_frames = []
-         new_text_parts = []
-
-         if isinstance(input_ids, list):
-             assert len(input_ids) and isinstance(input_ids[0], int)
-             input_text = self._processor.tokenizer.decode(input_ids)
-         else:
-             input_text = input_ids
-
-         text_parts = input_text.split(image_token)
-
-         # roughly calculate the max number of frames under the max_req_input_len limit
-         def calculate_max_num_frames() -> int:
-             ret = (max_req_input_len - len(input_ids)) // self.NUM_TOKEN_PER_FRAME
-             return min(ret, 100)
-
-         MAX_NUM_FRAMES = calculate_max_num_frames()
-         estimated_frames_list = self.get_estimated_frames_list(image_data=image_data)
-         total_frame_count = sum(estimated_frames_list)
-         # a heuristic value, suggesting the maximum fraction of frames to embed from all visual inputs.
-         # e.g., 0.1 suggests that 1 frame out of 10 input frames should be used
-         scaling_factor = min(1.0, MAX_NUM_FRAMES / total_frame_count)
-
-         # Process each input with allocated frames
-         for image_index, (image, estimated_frames) in enumerate(
-             zip(image_data, estimated_frames_list)
-         ):
-             if len(all_frames) >= MAX_NUM_FRAMES:
-                 frames_to_process = 0
-             else:
-                 frames_to_process = max(1, int(estimated_frames * scaling_factor))
-
-             if frames_to_process == 0:
-                 frames = []
-             else:
-                 try:
-                     if isinstance(image, str) and image.startswith("video:"):
-                         path = image[len("video:") :]
-                         frames = self.encode_video(
-                             path, frame_count_limit=frames_to_process
-                         )
-                     else:
-                         raw_image, _size = load_image(image)
-                         frames = [raw_image]
-                     if len(frames) == 0:
-                         continue
-                 except FileNotFoundError as e:
-                     print(e)
-                     return None
-                 image_sizes += frames[0].size * len(frames)
-                 image_hashes += [hash(image)] * len(frames)
-                 all_frames += frames

-             new_text_parts.append(text_parts[image_index])
-             if frames_to_process != 0:
-                 new_text_parts.append(image_token * len(frames))
-             assert frames_to_process == len(frames)
-
-         new_text_parts.append(text_parts[-1])
-
-         input_text = "".join(new_text_parts)
-         return BaseImageProcessorOutput(
-             image_hashes, image_sizes, all_frames, input_text
-         )
-
-
- class DummyImageProcessor(BaseImageProcessor):
-     def __init__(self):
-         pass
-
-     async def process_images_async(self, *args, **kwargs):
-         return None
-
-
- class LlavaImageProcessor(BaseImageProcessor):
-     def __init__(self, hf_config, server_args, _processor):
-         super().__init__(hf_config, server_args, _processor)
-
-     @staticmethod
-     def _process_single_image_task(
-         image_data: Union[str, bytes],
-         image_aspect_ratio: Optional[str] = None,
-         image_grid_pinpoints: Optional[str] = None,
-         image_processor=None,
-     ):
-         image_processor = image_processor or global_processor.image_processor
-
-         try:
-             image, image_size = load_image(image_data)
-             if image_size is not None:
-                 # It is a video with multiple images
-                 image_hash = hash(image_data)
-                 pixel_values = image_processor(image)["pixel_values"]
-                 for _ in range(len(pixel_values)):
-                     pixel_values[_] = pixel_values[_].astype(np.float16)
-                 pixel_values = np.stack(pixel_values, axis=0)
-                 return pixel_values, image_hash, image_size
-             else:
-                 # It is an image
-                 image_hash = hash(image_data)
-                 if image_aspect_ratio == "pad":
-                     image = expand2square(
-                         image,
-                         tuple(int(x * 255) for x in image_processor.image_mean),
-                     )
-                     pixel_values = image_processor(image.convert("RGB"))[
-                         "pixel_values"
-                     ][0]
-                 elif image_aspect_ratio == "anyres" or (
-                     image_aspect_ratio is not None
-                     and "anyres_max" in image_aspect_ratio
-                 ):
-                     pixel_values = process_anyres_image(
-                         image, image_processor, image_grid_pinpoints
-                     )
-                 else:
-                     pixel_values = image_processor(image)["pixel_values"][0]
-
-                 if isinstance(pixel_values, np.ndarray):
-                     pixel_values = pixel_values.astype(np.float16)
-
-                 return pixel_values, image_hash, image.size
-         except Exception:
-             logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
-
-     async def _process_single_image(
-         self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str
-     ):
-         if self.executor is not None:
-             loop = asyncio.get_event_loop()
-             return await loop.run_in_executor(
-                 self.executor,
-                 LlavaImageProcessor._process_single_image_task,
-                 image_data,
-                 aspect_ratio,
-                 grid_pinpoints,
-             )
-         else:
-             return self._process_single_image_task(
-                 image_data, aspect_ratio, grid_pinpoints
-             )
-
-     async def process_images_async(
-         self,
-         image_data: List[Union[str, bytes]],
-         input_text,
-         request_obj,
-         *args,
-         **kwargs,
-     ):
-         if not image_data:
-             return None
-
-         modalities = request_obj.modalities or ["image"]
-         aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", None)
-         grid_pinpoints = (
-             self.hf_config.image_grid_pinpoints
-             if hasattr(self.hf_config, "image_grid_pinpoints")
-             and "anyres" in aspect_ratio
-             else None
-         )
-
-         if isinstance(image_data, str):
-             image_data = [image_data]
-
-         if isinstance(image_data, list) and len(image_data) > 0:
-             if "multi-images" in modalities or "video" in modalities:
-                 # Multiple images
-                 aspect_ratio = "pad"  # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
-                 pixel_values, image_hashes, image_sizes = [], [], []
-                 res = []
-                 for img_data in image_data:
-                     res.append(
-                         self._process_single_image(
-                             img_data, aspect_ratio, grid_pinpoints
-                         )
-                     )
-                 res = await asyncio.gather(*res)
-                 for pixel_v, image_h, image_s in res:
-                     pixel_values.append(pixel_v)
-                     image_hashes.append(image_h)
-                     image_sizes.append(image_s)
-
-                 if isinstance(pixel_values[0], np.ndarray):
-                     pixel_values = np.stack(pixel_values, axis=0)
-             else:
-                 # A single image
-                 pixel_values, image_hash, image_size = await self._process_single_image(
-                     image_data[0], aspect_ratio, grid_pinpoints
-                 )
-                 image_hashes = [image_hash]
-                 image_sizes = [image_size]
-         else:
-             raise ValueError(f"Invalid image data: {image_data}")
-
-         return {
-             "pixel_values": pixel_values,
-             "image_hashes": image_hashes,
-             "image_sizes": image_sizes,
-             "modalities": request_obj.modalities or ["image"],
-         }
-
-
- class MllamaImageProcessor(BaseImageProcessor):
-     def __init__(self, hf_config, server_args, _processor):
-         super().__init__(hf_config, server_args, _processor)
-
-     @staticmethod
-     def _process_single_image_task(images, input_text):
-         # input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask'
-         return global_processor(images, input_text, return_tensors="pt")
-
-     async def _process_single_image(self, images, input_text):
-         if self.executor is not None:
-             loop = asyncio.get_event_loop()
-             image_inputs = await loop.run_in_executor(
-                 self.executor,
-                 MllamaImageProcessor._process_single_image_task,
-                 images,
-                 input_text,
-             )
-         else:
-             image_inputs = self._processor(images, input_text, return_tensors="pt")
-
-         return image_inputs
-
-     async def process_images_async(
-         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
-     ):
-         if not image_data:
-             return None
-
-         if isinstance(input_text, list):
-             assert len(input_text) and isinstance(input_text[0], int)
-             input_text = self._processor.tokenizer.decode(input_text)
-
-         if not isinstance(image_data, list):
-             image_data = [image_data]
-
-         if len(image_data) > 0:
-             images = [load_image(image)[0] for image in image_data]
-         else:
-             images = load_image(image_data[0])[0]
-
-         image_inputs = await self._process_single_image(images, input_text)
-         image_inputs["image_hashes"] = [hash(str(image_data))]
-         image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
-
-         return image_inputs
-
-
- class MiniCPMVImageProcessor(BaseImageProcessor):
-     def __init__(self, hf_config, server_args, _processor):
-         super().__init__(hf_config, server_args, _processor)
-         self.IMAGE_TOKEN = "(<image>./</image>)"
-
-     @staticmethod
-     def _process_images_task(images, input_text):
-         result = global_processor.__call__(
-             text=input_text, images=images, return_tensors="pt"
-         )
-         return {
-             "input_ids": result.input_ids,
-             "pixel_values": result.pixel_values,
-             "tgt_sizes": result.tgt_sizes,
-         }
-
-     async def _process_images(self, images, input_text):
-         if self.executor is not None:
-             loop = asyncio.get_event_loop()
-             image_inputs = await loop.run_in_executor(
-                 self.executor,
-                 MiniCPMVImageProcessor._process_images_task,
-                 images,
-                 input_text,
-             )
-         else:
-             image_inputs = self._processor(
-                 images=images, text=input_text, return_tensors="pt"
-             )
-
-         return image_inputs
-
-     async def process_images_async(
-         self,
-         image_data: List[Union[str, bytes]],
-         input_ids,
-         request_obj,
-         max_req_input_len,
-     ):
-         if not image_data:
-             return None
-         if not isinstance(image_data, list):
-             image_data = [image_data]
-
-         base_output = self.load_images(
-             max_req_input_len, input_ids, image_data, self.IMAGE_TOKEN
-         )
-         if base_output is None:
-             return None
-
-         if len(base_output.all_frames) == 0:
-             return None
-         res = await self._process_images(
-             images=base_output.all_frames, input_text=base_output.input_text
-         )
-
-         # Collect special token ids
-         tokenizer = self._processor.tokenizer
-         im_start_id = [tokenizer.im_start_id]
-         im_end_id = [tokenizer.im_end_id]
-         if tokenizer.slice_start_id:
-             slice_start_id = [tokenizer.slice_start_id]
-             slice_end_id = [tokenizer.slice_end_id]
-         return {
-             "input_ids": res["input_ids"].flatten().tolist(),
-             "pixel_values": res["pixel_values"],
-             "tgt_sizes": res["tgt_sizes"],
-             "image_hashes": base_output.image_hashes,
-             "modalities": request_obj.modalities or ["image"],
-             "im_start_id": im_start_id,
-             "im_end_id": im_end_id,
-             "slice_start_id": slice_start_id,
-             "slice_end_id": slice_end_id,
-         }
-
-
- class Qwen2VLImageProcessor(BaseImageProcessor):
-     def __init__(self, hf_config, server_args, _image_processor):
-         self.hf_config = hf_config
-         self._image_processor = _image_processor
-         self.executor = concurrent.futures.ProcessPoolExecutor(
-             initializer=init_global_processor,
-             mp_context=mp.get_context("fork"),
-             initargs=(server_args,),
-             max_workers=int(os.environ.get("SGLANG_CPU_COUNT", os.cpu_count())),
-         )
-
-     @staticmethod
-     def _process_single_image_task(
-         image_data: Union[str, bytes],
-         image_processor=None,
-     ):
-         image_processor = image_processor or global_processor.image_processor
-
-         try:
-             image, image_size = load_image(image_data)
-             if image_size is not None:
-                 # It is a video with multiple images
-                 image_hash = hash(image_data)
-                 process_result = image_processor(image)
-                 pixel_values, image_grid_thws = (
-                     process_result["pixel_values"],
-                     process_result["image_grid_thw"][0],
-                 )
-                 for _ in range(len(pixel_values)):
-                     pixel_values[_] = pixel_values[_].astype(np.float16)
-                 pixel_values = np.stack(pixel_values, axis=0)
-                 image_grid_thws = np.stack(image_grid_thws, axis=0)
-                 return pixel_values, image_hash, image_size, image_grid_thws
-             else:
-                 # It is an image
-                 image_hash = hash(image_data)
-                 process_result = image_processor(image)
-                 pixel_values, image_grid_thws = (
-                     process_result["pixel_values"],
-                     process_result["image_grid_thw"][0],
-                 )
-                 if isinstance(pixel_values, np.ndarray):
-                     pixel_values = pixel_values.astype(np.float16)
-
-                 return pixel_values, image_hash, image.size, image_grid_thws
-         except Exception:
-             logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
-
-     async def _process_single_image(self, image_data: Union[bytes, str]):
-         if self.executor is not None:
-             loop = asyncio.get_event_loop()
-             return await loop.run_in_executor(
-                 self.executor,
-                 Qwen2VLImageProcessor._process_single_image_task,
-                 image_data,
-             )
-         else:
-             return self._process_single_image_task(image_data)
-
-     async def process_images_async(
-         self,
-         image_data: List[Union[str, bytes]],
-         input_text,
-         request_obj,
-         *args,
-         **kwargs,
-     ):
-         if not image_data:
-             return None
-
-         if isinstance(image_data, list) and len(image_data) > 0:
-             # Multiple images
-             if len(image_data) > 1:
-                 pixel_values, image_hashes, image_sizes, image_grid_thws = (
-                     [],
-                     [],
-                     [],
-                     [],
-                 )
-                 res = []
-                 for img_data in image_data:
-                     res.append(self._process_single_image(img_data))
-                 res = await asyncio.gather(*res)
-                 for pixel_v, image_h, image_s, image_thw in res:
-                     pixel_values.append(pixel_v)
-                     image_hashes.append(image_h)
-                     image_sizes.append(image_s)
-                     image_grid_thws.append(image_thw)
-
-                 if isinstance(pixel_values[0], np.ndarray):
-                     pixel_values = np.concatenate(pixel_values, axis=0)
-             else:
-                 # A single image
-                 pixel_values, image_hash, image_size, image_grid_thw = (
-                     await self._process_single_image(image_data[0])
-                 )
-                 image_hashes = [image_hash]
-                 image_sizes = [image_size]
-                 image_grid_thws = [image_grid_thw]
-         elif isinstance(image_data, str) or isinstance(image_data, bytes):
-             # A single image
-             pixel_values, image_hash, image_size, image_grid_thw = (
-                 await self._process_single_image(image_data)
-             )
-             image_hashes = [image_hash]
-             image_sizes = [image_size]
-             image_grid_thws = [image_grid_thw]
-         else:
-
-             raise ValueError(f"Invalid image data: {image_data}")
-
-         return {
-             "pixel_values": pixel_values,
-             "image_hashes": image_hashes,
-             "image_sizes": image_sizes,
-             "modalities": request_obj.modalities or ["image"],
-             "image_grid_thws": image_grid_thws,
-         }
-
-
- class Qwen2_5VLImageProcessor(BaseImageProcessor):
-     def __init__(self, hf_config, server_args, _processor):
-         super().__init__(hf_config, server_args, _processor)
-         self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
-         self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
-         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
-         self.NUM_TOKEN_PER_FRAME = 770
-
-     @staticmethod
-     def _process_images_task(images, input_text):
-         result = global_processor.__call__(
-             text=input_text, images=images, return_tensors="pt"
-         )
-         return {
-             "input_ids": result.input_ids,
-             "pixel_values": result.pixel_values,
-             "image_grid_thws": result.image_grid_thw,
-         }
-
-     async def _process_images(self, images, input_text) -> dict:
-         if self.executor is not None:
-             loop = asyncio.get_event_loop()
-             return await loop.run_in_executor(
-                 self.executor,
-                 Qwen2_5VLImageProcessor._process_images_task,
-                 images,
-                 input_text,
-             )
-         else:
-             return self._process_images_task(images, input_text)
-
-     async def process_images_async(
-         self,
-         image_data: List[Union[str, bytes]],
-         input_ids,
-         request_obj,
-         max_req_input_len,
-         *args,
-         **kwargs,
-     ):
-         if not image_data:
-             return None
-         if isinstance(image_data, str):
-             image_data = [image_data]
-
-         image_token = self.IMAGE_TOKEN
-         base_output = self.load_images(
-             max_req_input_len, input_ids, image_data, image_token
-         )
-
-         ret = await self._process_images(base_output.all_frames, base_output.input_text)
-
-         return {
-             "input_ids": ret["input_ids"].flatten().tolist(),
-             "pixel_values": ret["pixel_values"],
-             "image_hashes": base_output.image_hashes,
-             "modalities": request_obj.modalities or ["image"],
-             "image_grid_thws": ret["image_grid_thws"],
-             "im_start_id": self.IM_START_TOKEN_ID,
-             "im_end_id": self.IM_END_TOKEN_ID,
-         }
+ IMAGE_PROCESSOR_MAPPING = {}


  def get_image_processor(
      hf_config, server_args: ServerArgs, processor
  ) -> BaseImageProcessor:
-     if "MllamaForConditionalGeneration" in hf_config.architectures:
-         return MllamaImageProcessor(hf_config, server_args, processor)
-     elif "Qwen2VLForConditionalGeneration" in hf_config.architectures:
-
-         return Qwen2VLImageProcessor(hf_config, server_args, processor)
-     elif "Qwen2_5_VLForConditionalGeneration" in hf_config.architectures:
-         return Qwen2_5VLImageProcessor(hf_config, server_args, processor)
-
-     elif "MiniCPMV" in hf_config.architectures:
-         return MiniCPMVImageProcessor(hf_config, server_args, processor)
-     else:
-         return LlavaImageProcessor(hf_config, server_args, processor.image_processor)
+     for model_cls, processor_cls in IMAGE_PROCESSOR_MAPPING.items():
+         if model_cls.__name__ in hf_config.architectures:
+             return processor_cls(hf_config, server_args, processor)
+     raise ValueError(
+         f"No image processor found for architecture: {hf_config.architectures}"
+     )


  def get_dummy_image_processor():
      return DummyImageProcessor()
+
+
+ @lru_cache()
+ def import_image_processors():
+     package_name = "sglang.srt.managers.image_processors"
+     package = importlib.import_module(package_name)
+     for _, name, ispkg in pkgutil.iter_modules(package.__path__, package_name + "."):
+         if not ispkg:
+             try:
+                 module = importlib.import_module(name)
+             except Exception as e:
+                 logger.warning(f"Ignore import error when loading {name}: " f"{e}")
+                 continue
+             if hasattr(module, "ImageProcessorMapping"):
+                 entry = module.ImageProcessorMapping
+                 if isinstance(entry, dict):
+                     for processor_name, cls in entry.items():
+                         IMAGE_PROCESSOR_MAPPING[processor_name] = cls
+
+
+ # also register processors
+ import_image_processors()
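
The rewritten module above replaces the hard-coded if/elif dispatch on hf_config.architectures with a plugin-style registry: every module under sglang/srt/managers/image_processors/ (files 80-85 in the list above) can export an ImageProcessorMapping dict, and import_image_processors() scans the package once (memoized via lru_cache) and merges each mapping into IMAGE_PROCESSOR_MAPPING. Below is a minimal sketch of what such a per-model module registers; the class names are hypothetical stand-ins for illustration, not code from the wheel:

    # hypothetical_vlm.py - illustrative only; real modules such as qwen_vl.py
    # register their actual model and processor classes.
    from sglang.srt.managers.image_processors.base_image_processor import (
        BaseImageProcessor,
    )


    class MyVLMForConditionalGeneration:
        # Stand-in for a model class; real mappings key off classes
        # from sglang.srt.models.
        pass


    class MyVLMImageProcessor(BaseImageProcessor):
        async def process_images_async(self, image_data, input_text, *args, **kwargs):
            # Turn image_data into the dict of tensors the scheduler consumes.
            ...


    # Discovered by import_image_processors() and merged into IMAGE_PROCESSOR_MAPPING;
    # get_image_processor() then matches each key's __name__ against
    # hf_config.architectures.
    ImageProcessorMapping = {MyVLMForConditionalGeneration: MyVLMImageProcessor}

Because discovery runs once at import time, supporting a new model means dropping a module with such a mapping into the package; image_processor.py itself no longer needs edits.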