sglang 0.3.4__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. sglang/bench_latency.py +2 -1
  2. sglang/lang/chat_template.py +17 -0
  3. sglang/launch_server_llavavid.py +1 -1
  4. sglang/srt/configs/__init__.py +3 -0
  5. sglang/srt/configs/model_config.py +27 -2
  6. sglang/srt/configs/qwen2vl.py +133 -0
  7. sglang/srt/constrained/fsm_cache.py +10 -3
  8. sglang/srt/conversation.py +27 -0
  9. sglang/srt/hf_transformers_utils.py +16 -1
  10. sglang/srt/layers/attention/__init__.py +16 -5
  11. sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
  12. sglang/srt/layers/attention/flashinfer_backend.py +174 -54
  13. sglang/srt/layers/attention/triton_backend.py +22 -6
  14. sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
  15. sglang/srt/layers/linear.py +89 -63
  16. sglang/srt/layers/logits_processor.py +5 -5
  17. sglang/srt/layers/rotary_embedding.py +112 -0
  18. sglang/srt/layers/sampler.py +51 -39
  19. sglang/srt/lora/lora.py +3 -1
  20. sglang/srt/managers/data_parallel_controller.py +1 -1
  21. sglang/srt/managers/detokenizer_manager.py +4 -0
  22. sglang/srt/managers/image_processor.py +186 -13
  23. sglang/srt/managers/io_struct.py +10 -0
  24. sglang/srt/managers/schedule_batch.py +238 -68
  25. sglang/srt/managers/scheduler.py +69 -50
  26. sglang/srt/managers/tokenizer_manager.py +24 -4
  27. sglang/srt/managers/tp_worker.py +26 -111
  28. sglang/srt/managers/tp_worker_overlap_thread.py +209 -0
  29. sglang/srt/mem_cache/memory_pool.py +56 -10
  30. sglang/srt/mem_cache/radix_cache.py +4 -3
  31. sglang/srt/model_executor/cuda_graph_runner.py +87 -28
  32. sglang/srt/model_executor/forward_batch_info.py +83 -3
  33. sglang/srt/model_executor/model_runner.py +32 -11
  34. sglang/srt/models/chatglm.py +3 -3
  35. sglang/srt/models/deepseek_v2.py +2 -2
  36. sglang/srt/models/mllama.py +1004 -0
  37. sglang/srt/models/qwen2_vl.py +724 -0
  38. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
  39. sglang/srt/sampling/sampling_batch_info.py +13 -3
  40. sglang/srt/sampling/sampling_params.py +5 -7
  41. sglang/srt/server.py +12 -0
  42. sglang/srt/server_args.py +10 -0
  43. sglang/srt/utils.py +22 -0
  44. sglang/test/run_eval.py +2 -0
  45. sglang/test/runners.py +20 -1
  46. sglang/test/srt/sampling/penaltylib/utils.py +1 -0
  47. sglang/test/test_utils.py +100 -3
  48. sglang/version.py +1 -1
  49. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +17 -18
  50. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +53 -48
  51. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
  52. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
  53. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0
@@ -33,26 +33,32 @@ def init_global_processor(server_args: ServerArgs):
33
33
 
34
34
 
35
35
  class BaseImageProcessor(ABC):
36
+ def __init__(self, hf_config, server_args, _processor):
37
+ self.hf_config = hf_config
38
+ self._processor = _processor
39
+ self.executor = concurrent.futures.ProcessPoolExecutor(
40
+ initializer=init_global_processor,
41
+ mp_context=mp.get_context("fork"),
42
+ initargs=(server_args,),
43
+ max_workers=os.environ.get("SGLANG_CPU_COUNT", os.cpu_count()),
44
+ )
45
+
36
46
  @abstractmethod
37
- async def process_images_async(self, image_data, **kwargs):
47
+ async def process_images_async(self, image_data, input_text, **kwargs):
38
48
  pass
39
49
 
40
50
 
41
51
class DummyImageProcessor(BaseImageProcessor):
    """Image processor whose processing call always yields ``None``."""

    def __init__(self):
        # Does not call BaseImageProcessor.__init__, so no config, processor
        # or executor attribute is set on the instance.
        pass

    async def process_images_async(self, *args, **kwargs):
        """Accept any arguments, ignore them, and return ``None``."""
        return None
44
57
 
45
58
 
46
59
  class LlavaImageProcessor(BaseImageProcessor):
47
- def __init__(self, hf_config, server_args, _image_processor):
48
- self.hf_config = hf_config
49
- self._image_processor = _image_processor
50
- self.executor = concurrent.futures.ProcessPoolExecutor(
51
- initializer=init_global_processor,
52
- mp_context=mp.get_context("fork"),
53
- initargs=(server_args,),
54
- max_workers=os.environ.get("SGLANG_CPU_COUNT", os.cpu_count()),
55
- )
60
+ def __init__(self, hf_config, server_args, _processor):
61
+ super().__init__(hf_config, server_args, _processor)
56
62
 
57
63
  @staticmethod
58
64
  def _process_single_image_task(
@@ -119,7 +125,7 @@ class LlavaImageProcessor(BaseImageProcessor):
119
125
  )
120
126
 
121
127
  async def process_images_async(
122
- self, image_data: List[Union[str, bytes]], request_obj
128
+ self, image_data: List[Union[str, bytes]], input_text, request_obj
123
129
  ):
124
130
  if not image_data:
125
131
  return None
@@ -177,10 +183,177 @@ class LlavaImageProcessor(BaseImageProcessor):
177
183
  }
178
184
 
179
185
 
186
class MllamaImageProcessor(BaseImageProcessor):
    """Image processor that feeds the images and the prompt text to one
    processor call and returns that call's result with two extra entries."""

    def __init__(self, hf_config, server_args, _processor):
        super().__init__(hf_config, server_args, _processor)

    @staticmethod
    def _process_single_image_task(images, input_text):
        """Run the module-level ``global_processor`` on ``images`` and text.

        Static so it can be submitted to the executor without the instance.
        """
        # Result keys noted by the original author: 'input_ids',
        # 'attention_mask', 'pixel_values', 'aspect_ratio_ids',
        # 'aspect_ratio_mask', 'cross_attention_mask'
        return global_processor(images, input_text, return_tensors="pt")

    async def _process_single_image(self, images, input_text):
        """Process in the executor when one exists, otherwise inline."""
        if self.executor is None:
            return self._processor(images, input_text, return_tensors="pt")

        # This coroutine is always running inside a loop, so ask for that
        # loop directly instead of going through the event-loop policy.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self.executor,
            MllamaImageProcessor._process_single_image_task,
            images,
            input_text,
        )

    async def process_images_async(
        self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
    ):
        """Load the images, run the processor and return its result.

        Args:
            image_data: One image reference or a list of them; each is handed
                to ``load_image``.
            input_text: Prompt text, or a list of ints that is decoded with
                ``self._processor.tokenizer`` first.
            *args, **kwargs: Accepted and ignored.

        Returns:
            ``None`` when ``image_data`` is empty; otherwise the processor
            result with ``"image_hashes"`` added and ``"input_ids"`` replaced
            by the first row of its ``tolist()`` form.
        """
        if not image_data:
            return None

        if isinstance(input_text, list):
            assert len(input_text) and isinstance(input_text[0], int)
            input_text = self._processor.tokenizer.decode(input_text)

        if not isinstance(image_data, list):
            image_data = [image_data]

        # image_data is a non-empty list at this point, so every request is
        # loaded through the same path (the old zero-length branch could
        # never run).
        images = [load_image(image)[0] for image in image_data]

        image_inputs = await self._process_single_image(images, input_text)
        image_inputs["image_hashes"] = [hash(str(image_data))]
        image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]

        return image_inputs
232
+
233
+
234
class Qwen2VLImageProcessor(BaseImageProcessor):
    """Image processor that returns pixel values together with the
    ``image_grid_thw`` entry produced by the underlying image processor."""

    def __init__(self, hf_config, server_args, _image_processor):
        """Store the config and image processor and create the worker pool.

        Args:
            hf_config: Model configuration object; kept as ``self.hf_config``.
            server_args: Passed to ``init_global_processor`` in each worker
                process through the executor's ``initargs``.
            _image_processor: Image processor; kept as
                ``self._image_processor``.
        """
        self.hf_config = hf_config
        self._image_processor = _image_processor

        # Environment values are strings; ProcessPoolExecutor needs an int
        # (or None for its default), so convert the override explicitly.
        worker_count = os.environ.get("SGLANG_CPU_COUNT", os.cpu_count())
        if worker_count is not None:
            worker_count = int(worker_count)

        self.executor = concurrent.futures.ProcessPoolExecutor(
            initializer=init_global_processor,
            mp_context=mp.get_context("fork"),
            initargs=(server_args,),
            max_workers=worker_count,
        )

    @staticmethod
    def _process_single_image_task(
        image_data: Union[str, bytes],
        image_processor=None,
    ):
        """Load one input and run the image processor on it.

        Args:
            image_data: Value handed to ``load_image`` and to ``hash``.
            image_processor: Processor to call; when falsy,
                ``global_processor.image_processor`` is used instead.

        Returns:
            ``(pixel_values, image_hash, size, image_grid_thws)``.

        Raises:
            Exception: Any failure is logged and then re-raised, so the
                caller sees the real error rather than a ``None`` result it
                cannot unpack.
        """
        image_processor = image_processor or global_processor.image_processor

        try:
            image, image_size = load_image(image_data)
            image_hash = hash(image_data)
            process_result = image_processor(image)
            pixel_values = process_result["pixel_values"]
            image_grid_thws = process_result["image_grid_thw"][0]

            if image_size is not None:
                # It is a video with multiple images
                # NOTE(review): if pixel_values is an ndarray, assigning
                # float16 rows back into it keeps the array's own dtype;
                # confirm whether float16 output is expected here.
                for idx in range(len(pixel_values)):
                    pixel_values[idx] = pixel_values[idx].astype(np.float16)
                pixel_values = np.stack(pixel_values, axis=0)
                image_grid_thws = np.stack(image_grid_thws, axis=0)
                return pixel_values, image_hash, image_size, image_grid_thws

            # It is an image
            if isinstance(pixel_values, np.ndarray):
                pixel_values = pixel_values.astype(np.float16)
            return pixel_values, image_hash, image.size, image_grid_thws
        except Exception:
            logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
            raise

    async def _process_single_image(self, image_data: Union[bytes, str]):
        """Process one input in the executor when one exists, else inline."""
        if self.executor is None:
            # Use this instance's processor rather than relying on the
            # module-level global being initialised in this process.
            return self._process_single_image_task(
                image_data, self._image_processor
            )

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self.executor,
            Qwen2VLImageProcessor._process_single_image_task,
            image_data,
        )

    async def process_images_async(
        self, image_data: List[Union[str, bytes]], input_text, request_obj
    ):
        """Process every input and collect the per-image results.

        Args:
            image_data: A list of inputs, or a single ``str`` input.
            input_text: Accepted for interface compatibility; not used.
            request_obj: Object whose ``modalities`` attribute is copied into
                the result.

        Returns:
            ``None`` when ``image_data`` is empty; otherwise a dict with
            ``pixel_values``, ``image_hashes``, ``image_sizes``,
            ``modalities`` and ``image_grid_thws``.

        Raises:
            ValueError: If ``image_data`` is neither a list nor a ``str``.
        """
        if not image_data:
            return None

        if isinstance(image_data, str):
            # A single image
            image_data = [image_data]
        elif not isinstance(image_data, list):
            raise ValueError(f"Invalid image data: {image_data}")

        results = await asyncio.gather(
            *(self._process_single_image(item) for item in image_data)
        )
        pixel_values, image_hashes, image_sizes, image_grid_thws = (
            list(column) for column in zip(*results)
        )

        if len(results) == 1:
            # A single image: hand back its pixel values as-is.
            pixel_values = pixel_values[0]
        elif isinstance(pixel_values[0], np.ndarray):
            # Multiple images: join the arrays along the first axis.
            pixel_values = np.concatenate(pixel_values, axis=0)

        return {
            "pixel_values": pixel_values,
            "image_hashes": image_hashes,
            "image_sizes": image_sizes,
            "modalities": request_obj.modalities,
            "image_grid_thws": image_grid_thws,
        }
346
+
347
+
180
348
  def get_image_processor(
181
- hf_config, server_args: ServerArgs, _image_processor
349
+ hf_config, server_args: ServerArgs, processor
182
350
  ) -> BaseImageProcessor:
183
- return LlavaImageProcessor(hf_config, server_args, _image_processor)
351
+ if "MllamaForConditionalGeneration" in hf_config.architectures:
352
+ return MllamaImageProcessor(hf_config, server_args, processor)
353
+ elif "Qwen2VLForConditionalGeneration" in hf_config.architectures:
354
+ return Qwen2VLImageProcessor(hf_config, server_args, processor.image_processor)
355
+ else:
356
+ return LlavaImageProcessor(hf_config, server_args, processor.image_processor)
184
357
 
185
358
 
186
359
  def get_dummy_image_processor():
@@ -353,3 +353,13 @@ class AbortReq:
353
353
class ProfileReq(Enum):
    """Enumeration with one member for starting and one for stopping."""

    # Member values are fixed integers.
    START_PROFILE = 1
    STOP_PROFILE = 2
356
+
357
+
358
@dataclass
class GetMemPoolSizeReq:
    """Request message that carries no fields."""
361
+
362
+
363
@dataclass
class GetMemPoolSizeReqOutput:
    """Message carrying a single integer field named ``size``."""

    size: int