sglang 0.3.4__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +2 -1
- sglang/lang/chat_template.py +17 -0
- sglang/launch_server_llavavid.py +1 -1
- sglang/srt/configs/__init__.py +3 -0
- sglang/srt/configs/model_config.py +27 -2
- sglang/srt/configs/qwen2vl.py +133 -0
- sglang/srt/constrained/fsm_cache.py +10 -3
- sglang/srt/conversation.py +27 -0
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/__init__.py +16 -5
- sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
- sglang/srt/layers/attention/flashinfer_backend.py +174 -54
- sglang/srt/layers/attention/triton_backend.py +22 -6
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
- sglang/srt/layers/linear.py +89 -63
- sglang/srt/layers/logits_processor.py +5 -5
- sglang/srt/layers/rotary_embedding.py +112 -0
- sglang/srt/layers/sampler.py +51 -39
- sglang/srt/lora/lora.py +3 -1
- sglang/srt/managers/data_parallel_controller.py +1 -1
- sglang/srt/managers/detokenizer_manager.py +4 -0
- sglang/srt/managers/image_processor.py +186 -13
- sglang/srt/managers/io_struct.py +10 -0
- sglang/srt/managers/schedule_batch.py +238 -68
- sglang/srt/managers/scheduler.py +69 -50
- sglang/srt/managers/tokenizer_manager.py +24 -4
- sglang/srt/managers/tp_worker.py +26 -111
- sglang/srt/managers/tp_worker_overlap_thread.py +209 -0
- sglang/srt/mem_cache/memory_pool.py +56 -10
- sglang/srt/mem_cache/radix_cache.py +4 -3
- sglang/srt/model_executor/cuda_graph_runner.py +87 -28
- sglang/srt/model_executor/forward_batch_info.py +83 -3
- sglang/srt/model_executor/model_runner.py +32 -11
- sglang/srt/models/chatglm.py +3 -3
- sglang/srt/models/deepseek_v2.py +2 -2
- sglang/srt/models/mllama.py +1004 -0
- sglang/srt/models/qwen2_vl.py +724 -0
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
- sglang/srt/sampling/sampling_batch_info.py +13 -3
- sglang/srt/sampling/sampling_params.py +5 -7
- sglang/srt/server.py +12 -0
- sglang/srt/server_args.py +10 -0
- sglang/srt/utils.py +22 -0
- sglang/test/run_eval.py +2 -0
- sglang/test/runners.py +20 -1
- sglang/test/srt/sampling/penaltylib/utils.py +1 -0
- sglang/test/test_utils.py +100 -3
- sglang/version.py +1 -1
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +17 -18
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +53 -48
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0
@@ -33,26 +33,32 @@ def init_global_processor(server_args: ServerArgs):
|
|
33
33
|
|
34
34
|
|
35
35
|
class BaseImageProcessor(ABC):
    """Abstract base for model-specific image processors.

    Stores the HF config and processor object and creates a process pool
    whose workers are initialised with ``init_global_processor(server_args)``.
    Subclasses implement :meth:`process_images_async`.
    """

    def __init__(self, hf_config, server_args, _processor):
        """Store the configuration and create the worker pool.

        Args:
            hf_config: Model configuration object, kept as ``self.hf_config``.
            server_args: Passed to ``init_global_processor`` in every worker.
            _processor: Processor object, kept as ``self._processor``.
        """
        self.hf_config = hf_config
        self._processor = _processor
        self.executor = concurrent.futures.ProcessPoolExecutor(
            initializer=init_global_processor,
            mp_context=mp.get_context("fork"),
            initargs=(server_args,),
            max_workers=self._max_workers(),
        )

    @staticmethod
    def _max_workers():
        """Return the pool size from ``SGLANG_CPU_COUNT`` or the CPU count.

        Environment variables are always strings, while
        ``ProcessPoolExecutor`` requires an ``int`` (or ``None``) for
        ``max_workers``; passing the raw string raises ``TypeError``.

        Raises:
            ValueError: If ``SGLANG_CPU_COUNT`` is set to a non-integer value.
        """
        raw = os.environ.get("SGLANG_CPU_COUNT")
        if not raw:
            # Unset or empty: fall back to the machine's CPU count.
            return os.cpu_count()
        return int(raw)

    @abstractmethod
    async def process_images_async(self, image_data, input_text, **kwargs):
        """Process ``image_data`` (with its prompt ``input_text``) asynchronously."""
        pass
|
39
49
|
|
40
50
|
|
41
51
|
class DummyImageProcessor(BaseImageProcessor):
    """Placeholder processor that never produces any image inputs."""

    def __init__(self):
        """Do nothing.

        The base initializer is intentionally not called, so this instance
        has no ``hf_config``, ``_processor`` or ``executor`` attribute.
        """

    async def process_images_async(self, *args, **kwargs):
        """Accept any arguments and return ``None``."""
        return None
|
44
57
|
|
45
58
|
|
46
59
|
class LlavaImageProcessor(BaseImageProcessor):
|
47
|
-
def __init__(self, hf_config, server_args,
|
48
|
-
|
49
|
-
self._image_processor = _image_processor
|
50
|
-
self.executor = concurrent.futures.ProcessPoolExecutor(
|
51
|
-
initializer=init_global_processor,
|
52
|
-
mp_context=mp.get_context("fork"),
|
53
|
-
initargs=(server_args,),
|
54
|
-
max_workers=os.environ.get("SGLANG_CPU_COUNT", os.cpu_count()),
|
55
|
-
)
|
60
|
+
def __init__(self, hf_config, server_args, _processor):
|
61
|
+
super().__init__(hf_config, server_args, _processor)
|
56
62
|
|
57
63
|
@staticmethod
|
58
64
|
def _process_single_image_task(
|
@@ -119,7 +125,7 @@ class LlavaImageProcessor(BaseImageProcessor):
|
|
119
125
|
)
|
120
126
|
|
121
127
|
async def process_images_async(
|
122
|
-
self, image_data: List[Union[str, bytes]], request_obj
|
128
|
+
self, image_data: List[Union[str, bytes]], input_text, request_obj
|
123
129
|
):
|
124
130
|
if not image_data:
|
125
131
|
return None
|
@@ -177,10 +183,177 @@ class LlavaImageProcessor(BaseImageProcessor):
|
|
177
183
|
}
|
178
184
|
|
179
185
|
|
186
|
+
class MllamaImageProcessor(BaseImageProcessor):
    """Image processor for Mllama models.

    Runs the full processor over the images and the prompt text together,
    either in the process pool or, when no executor is set, in-process.
    """

    def __init__(self, hf_config, server_args, _processor):
        super().__init__(hf_config, server_args, _processor)

    @staticmethod
    def _process_single_image_task(images, input_text):
        """Run the module-level ``global_processor`` on images and text.

        Used as the pool task, so it reads the worker's ``global_processor``
        rather than any instance state.
        """
        # Expected result keys (per the original author's note):
        # 'input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids',
        # 'aspect_ratio_mask', 'cross_attention_mask'
        return global_processor(images, input_text, return_tensors="pt")

    async def _process_single_image(self, images, input_text):
        """Process ``images`` with ``input_text`` and return the processor output."""
        if self.executor is None:
            # No pool available: run the instance's own processor inline.
            return self._processor(images, input_text, return_tensors="pt")

        # We are inside a coroutine, so a running loop is guaranteed to exist.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self.executor,
            MllamaImageProcessor._process_single_image_task,
            images,
            input_text,
        )

    async def process_images_async(
        self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
    ):
        """Load the images and run the processor on them with the prompt.

        Args:
            image_data: One image reference or a list of them; each is passed
                to ``load_image``.
            input_text: Prompt text, or a non-empty list of integer token ids
                which is decoded with the processor's tokenizer first.

        Returns:
            ``None`` when ``image_data`` is empty; otherwise the processor
            output with ``"image_hashes"`` added and ``"input_ids"`` replaced
            by the first row as a plain list.
        """
        if not image_data:
            return None

        if isinstance(input_text, list):
            assert len(input_text) and isinstance(input_text[0], int)
            input_text = self._processor.tokenizer.decode(input_text)

        if not isinstance(image_data, list):
            image_data = [image_data]

        # image_data is a non-empty list at this point (empty input returned
        # above), so every entry is loaded; load_image returns a tuple whose
        # first element is the image.
        images = [load_image(image)[0] for image in image_data]

        image_inputs = await self._process_single_image(images, input_text)
        image_inputs["image_hashes"] = [hash(str(image_data))]
        image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]

        return image_inputs
|
232
|
+
|
233
|
+
|
234
|
+
class Qwen2VLImageProcessor(BaseImageProcessor):
    """Image processor for Qwen2-VL models.

    Each image (or video) is loaded and run through the image processor
    individually; the per-item results are then merged into one dict.
    """

    def __init__(self, hf_config, server_args, _image_processor):
        # Reuse the shared setup (hf_config + process pool) from the base
        # class instead of duplicating it here.
        super().__init__(hf_config, server_args, _image_processor)
        # Kept under its original attribute name for existing readers.
        self._image_processor = _image_processor

    @staticmethod
    def _process_single_image_task(
        image_data: Union[str, bytes],
        image_processor=None,
    ):
        """Load and preprocess one image or video.

        Args:
            image_data: Reference passed to ``load_image``; also hashed to
                produce the returned image hash.
            image_processor: Processor to call on the loaded image. Falls back
                to ``global_processor.image_processor`` when not given.

        Returns:
            ``(pixel_values, image_hash, image_size, image_grid_thw)``.

        Raises:
            Exception: Any failure is logged and re-raised so the caller sees
                the real cause rather than failing later while unpacking
                ``None``.
        """
        image_processor = image_processor or global_processor.image_processor

        try:
            image, image_size = load_image(image_data)
            image_hash = hash(image_data)
            process_result = image_processor(image)
            pixel_values = process_result["pixel_values"]
            image_grid_thws = process_result["image_grid_thw"][0]

            if image_size is not None:
                # It is a video with multiple images
                # NOTE(review): if pixel_values is an ndarray, this in-place
                # assignment casts back to the array's own dtype — confirm
                # whether float16 output is intended here.
                for idx in range(len(pixel_values)):
                    pixel_values[idx] = pixel_values[idx].astype(np.float16)
                pixel_values = np.stack(pixel_values, axis=0)
                image_grid_thws = np.stack(image_grid_thws, axis=0)
                return pixel_values, image_hash, image_size, image_grid_thws

            # It is an image
            if isinstance(pixel_values, np.ndarray):
                pixel_values = pixel_values.astype(np.float16)
            return pixel_values, image_hash, image.size, image_grid_thws
        except Exception:
            logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
            raise

    async def _process_single_image(self, image_data: Union[bytes, str]):
        """Run :meth:`_process_single_image_task` in the pool, or inline."""
        if self.executor is None:
            # No worker pool (and hence possibly no initialised global
            # processor): use this instance's own image processor.
            return self._process_single_image_task(image_data, self._image_processor)

        # We are inside a coroutine, so a running loop is guaranteed to exist.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self.executor,
            Qwen2VLImageProcessor._process_single_image_task,
            image_data,
        )

    async def process_images_async(
        self, image_data: List[Union[str, bytes]], input_text, request_obj
    ):
        """Preprocess one or more images for a request.

        Args:
            image_data: A single string reference or a non-empty list of
                references.
            input_text: Unused by this processor.
            request_obj: Request whose ``modalities`` is copied into the result.

        Returns:
            ``None`` when ``image_data`` is empty; otherwise a dict with
            ``pixel_values``, ``image_hashes``, ``image_sizes``, ``modalities``
            and ``image_grid_thws``.

        Raises:
            ValueError: If ``image_data`` is neither a string nor a list.
        """
        if not image_data:
            return None

        if isinstance(image_data, list):
            items = image_data
        elif isinstance(image_data, str):
            items = [image_data]
        else:
            raise ValueError(f"Invalid image data: {image_data}")

        results = await asyncio.gather(
            *(self._process_single_image(item) for item in items)
        )
        pixel_list, image_hashes, image_sizes, image_grid_thws = (
            list(column) for column in zip(*results)
        )

        if len(items) > 1:
            # Multiple images: merge ndarray outputs along the first axis,
            # otherwise keep the per-image list as-is.
            if isinstance(pixel_list[0], np.ndarray):
                pixel_values = np.concatenate(pixel_list, axis=0)
            else:
                pixel_values = pixel_list
        else:
            # A single image: pass its pixel values through unwrapped.
            pixel_values = pixel_list[0]

        return {
            "pixel_values": pixel_values,
            "image_hashes": image_hashes,
            "image_sizes": image_sizes,
            "modalities": request_obj.modalities,
            "image_grid_thws": image_grid_thws,
        }
|
346
|
+
|
347
|
+
|
180
348
|
def get_image_processor(
    hf_config, server_args: ServerArgs, processor
) -> BaseImageProcessor:
    """Build the image processor matching the model architecture.

    Mllama models receive the full ``processor``; Qwen2-VL and every other
    architecture receive only ``processor.image_processor``.
    """
    architectures = hf_config.architectures

    if "MllamaForConditionalGeneration" in architectures:
        return MllamaImageProcessor(hf_config, server_args, processor)

    # The remaining processors share one construction pattern and differ
    # only in class; anything that is not Qwen2-VL falls back to Llava.
    if "Qwen2VLForConditionalGeneration" in architectures:
        processor_cls = Qwen2VLImageProcessor
    else:
        processor_cls = LlavaImageProcessor
    return processor_cls(hf_config, server_args, processor.image_processor)
|
184
357
|
|
185
358
|
|
186
359
|
def get_dummy_image_processor():
|
sglang/srt/managers/io_struct.py
CHANGED