sglang 0.4.4.post3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (99)
  1. sglang/bench_serving.py +49 -7
  2. sglang/lang/chat_template.py +24 -0
  3. sglang/srt/_custom_ops.py +59 -92
  4. sglang/srt/configs/model_config.py +5 -0
  5. sglang/srt/constrained/base_grammar_backend.py +5 -1
  6. sglang/srt/conversation.py +29 -4
  7. sglang/srt/custom_op.py +5 -0
  8. sglang/srt/distributed/device_communicators/custom_all_reduce.py +27 -79
  9. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  10. sglang/srt/entrypoints/engine.py +0 -5
  11. sglang/srt/layers/attention/flashattention_backend.py +678 -83
  12. sglang/srt/layers/attention/flashinfer_backend.py +5 -7
  13. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
  14. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  15. sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
  16. sglang/srt/layers/moe/ep_moe/layer.py +79 -80
  17. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
  18. sglang/srt/layers/moe/fused_moe_native.py +5 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +416 -50
  30. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  31. sglang/srt/layers/moe/topk.py +49 -3
  32. sglang/srt/layers/quantization/__init__.py +5 -1
  33. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  34. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
  35. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
  36. sglang/srt/layers/quantization/fp8.py +3 -1
  37. sglang/srt/layers/quantization/fp8_utils.py +1 -4
  38. sglang/srt/layers/quantization/moe_wna16.py +503 -0
  39. sglang/srt/layers/quantization/utils.py +1 -1
  40. sglang/srt/layers/quantization/w8a8_int8.py +2 -0
  41. sglang/srt/layers/radix_attention.py +2 -0
  42. sglang/srt/layers/rotary_embedding.py +63 -12
  43. sglang/srt/managers/cache_controller.py +34 -11
  44. sglang/srt/managers/mm_utils.py +202 -156
  45. sglang/srt/managers/multimodal_processor.py +0 -2
  46. sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
  47. sglang/srt/managers/multimodal_processors/clip.py +7 -26
  48. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
  49. sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
  50. sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
  51. sglang/srt/managers/multimodal_processors/llava.py +34 -14
  52. sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
  53. sglang/srt/managers/multimodal_processors/mlama.py +10 -23
  54. sglang/srt/managers/multimodal_processors/mllama4.py +161 -0
  55. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
  56. sglang/srt/managers/schedule_batch.py +185 -128
  57. sglang/srt/managers/scheduler.py +4 -4
  58. sglang/srt/managers/tokenizer_manager.py +1 -1
  59. sglang/srt/managers/utils.py +1 -6
  60. sglang/srt/mem_cache/hiradix_cache.py +62 -52
  61. sglang/srt/mem_cache/memory_pool.py +72 -6
  62. sglang/srt/mem_cache/paged_allocator.py +39 -0
  63. sglang/srt/metrics/collector.py +23 -53
  64. sglang/srt/model_executor/cuda_graph_runner.py +8 -6
  65. sglang/srt/model_executor/forward_batch_info.py +10 -10
  66. sglang/srt/model_executor/model_runner.py +60 -57
  67. sglang/srt/model_loader/loader.py +8 -0
  68. sglang/srt/models/clip.py +12 -7
  69. sglang/srt/models/deepseek_janus_pro.py +10 -15
  70. sglang/srt/models/deepseek_v2.py +212 -121
  71. sglang/srt/models/deepseek_vl2.py +105 -104
  72. sglang/srt/models/gemma3_mm.py +14 -80
  73. sglang/srt/models/llama.py +16 -5
  74. sglang/srt/models/llama4.py +420 -0
  75. sglang/srt/models/llava.py +31 -19
  76. sglang/srt/models/llavavid.py +16 -7
  77. sglang/srt/models/minicpmo.py +63 -147
  78. sglang/srt/models/minicpmv.py +17 -27
  79. sglang/srt/models/mllama.py +29 -14
  80. sglang/srt/models/mllama4.py +154 -0
  81. sglang/srt/models/qwen2.py +9 -6
  82. sglang/srt/models/qwen2_5_vl.py +21 -31
  83. sglang/srt/models/qwen2_vl.py +20 -21
  84. sglang/srt/openai_api/adapter.py +18 -6
  85. sglang/srt/platforms/interface.py +371 -0
  86. sglang/srt/server_args.py +99 -14
  87. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
  88. sglang/srt/speculative/eagle_utils.py +140 -28
  89. sglang/srt/speculative/eagle_worker.py +93 -24
  90. sglang/srt/utils.py +104 -51
  91. sglang/test/test_custom_ops.py +55 -0
  92. sglang/test/test_utils.py +13 -26
  93. sglang/utils.py +2 -2
  94. sglang/version.py +1 -1
  95. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/METADATA +4 -3
  96. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/RECORD +99 -84
  97. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/top_level.txt +0 -0
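
Of the 99 changed files, the diffs reproduced below cover the four multimodal processors (minicpm.py, mlama.py, mllama4.py, qwen_vl.py). They share one refactor: the per-processor asyncio executor plumbing and `get_global_processor()` global are dropped in favor of the shared synchronous `process_mm_data` helper, and raw tensors are wrapped in `MultimodalDataItem`s tagged with a `Modality`. The minicpm.py hunk below additionally routes fast Hugging Face image processors to the GPU. A minimal sketch of that dispatch in isolation (the helper name is illustrative, not sglang API):

```python
from typing import Any

from transformers import BaseImageProcessorFast


def call_hf_processor(processor: Any, text: str, images: list) -> Any:
    """Illustrative: run an HF processor, preprocessing on GPU when supported."""
    kwargs = {}
    if isinstance(processor, BaseImageProcessorFast):
        # Fast (torch-backed) image processors accept a `device` argument,
        # so resize/rescale/normalize run on the GPU instead of the CPU.
        kwargs["device"] = "cuda"
    return processor(text=text, images=images, return_tensors="pt", **kwargs)
```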

sglang/srt/managers/multimodal_processors/minicpm.py
@@ -1,13 +1,13 @@
-import asyncio
 from typing import List, Union
 
 import torch
+from transformers import BaseImageProcessorFast
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.minicpmo import MiniCPMO
 from sglang.srt.models.minicpmv import MiniCPMV
 
@@ -21,19 +21,23 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         self.image_token = "(<image>./</image>)"
         self.audio_token = "(<audio>./</audio>)"
 
-    @staticmethod
-    def _process_data_task(input_text, images=None, audios=None):
+    def process_data_task(self, input_text, images=None, audios=None):
 
         if isinstance(images, list) and len(images) == 0:
             images = None
         if isinstance(audios, list) and len(audios) == 0:
             audios = None
-        result = get_global_processor().__call__(
+        processor = self._processor
+        args = {}
+        if isinstance(processor, BaseImageProcessorFast):
+            args["device"] = "cuda"
+        result = self._processor.__call__(
            text=input_text,
            images=images,
            audios=audios,
            return_tensors="pt",
            chunk_input=True,
+            **args,
         )
         return {
             "input_ids": result.input_ids,
@@ -44,23 +48,6 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             "audio_bounds": getattr(result, "audio_bounds", None),
         }
 
-    async def _process_data(self, images, input_text, audios=None):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            multimodal_data_inputs = await loop.run_in_executor(
-                self.executor,
-                MiniCPMMultimodalProcessor._process_data_task,
-                input_text,
-                images,
-                audios,
-            )
-        else:
-            multimodal_data_inputs = self._processor(
-                images=images, text=input_text, audios=audios, return_tensors="pt"
-            )
-
-        return multimodal_data_inputs
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
@@ -77,7 +64,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             audio_data = [audio_data]
 
         base_output = self.load_mm_data(
-            input_ids=input_ids,
+            prompt=input_ids,
             max_req_input_len=max_req_input_len,
             audio_data=audio_data,
             image_data=image_data,
@@ -88,9 +75,9 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         if base_output is None:
             return None
 
-        res = await self._process_data(
-            images=base_output.images,
+        res = self.process_mm_data(
             input_text=base_output.input_text,
+            images=base_output.images,
             audios=base_output.audios,
         )
 
@@ -142,23 +129,33 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             tgt_sizes_flat += [tgt_n]
 
         pixel_values = pixel_values_flat
-        if len(tgt_sizes_flat) == 0:
-            tgt_sizes = None
-        else:
-            tgt_sizes = torch.stack(tgt_sizes_flat)
-        if not isinstance(res["audio_features"], list):
-            res["audio_features"] = [res["audio_features"]]
+
+        items = []
+        if len(pixel_values) != 0:
+            item = MultimodalDataItem(
+                pixel_values=pixel_values,
+                tgt_size=tgt_sizes_flat,
+                modality=Modality.IMAGE,
+            )
+            items += [item]
+
+        if (
+            "audio_features" in res
+            and res["audio_features"] is not None
+            and len(res["audio_features"]) != 0
+        ):
+            item = MultimodalDataItem(
+                audio_features=[res["audio_features"]],
+                audio_feature_lens=res["audio_feature_lens"],
+                modality=Modality.AUDIO,
+            )
+            items += [item]
+
         return {
+            "mm_items": items,
             "input_ids": res["input_ids"].flatten().tolist(),
-            "pixel_values": pixel_values,
-            "tgt_sizes": tgt_sizes,
-            "data_hashes": base_output.mm_data_hashes,
-            "modalities": request_obj.modalities or ["image"],
             "audio_start_id": audio_start_id,
             "audio_end_id": audio_end_id,
-            "audio_features": res["audio_features"],
-            "audio_bounds": res["audio_bounds"],
-            "audio_feature_lens": res["audio_feature_lens"],
             "im_token_id": im_token_id,
             "im_start_id": tokenizer.im_start_id,
             "im_end_id": tokenizer.im_end_id,

sglang/srt/managers/multimodal_processors/mlama.py
@@ -1,10 +1,9 @@
-import asyncio
 from typing import List, Union
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama import MllamaForConditionalGeneration
 from sglang.srt.utils import load_image
 
@@ -15,25 +14,6 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
 
-    @staticmethod
-    def _process_single_image_task(images, input_text):
-        # input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask'
-        return get_global_processor()(images, input_text, return_tensors="pt")
-
-    async def _process_single_image(self, images, input_text):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            image_inputs = await loop.run_in_executor(
-                self.executor,
-                MllamaImageProcessor._process_single_image_task,
-                images,
-                input_text,
-            )
-        else:
-            image_inputs = self._processor(images, input_text, return_tensors="pt")
-
-        return image_inputs
-
     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
@@ -52,8 +32,15 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
         else:
             images = load_image(image_data[0])[0]
 
-        image_inputs = await self._process_single_image(images, input_text)
-        image_inputs["data_hashes"] = [hash(str(image_data))]
+        image_inputs = self.process_mm_data(input_text=input_text, images=images)
         image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
+        image_inputs["mm_items"] = [
+            MultimodalDataItem(
+                pixel_values=image_inputs["pixel_values"],
+                aspect_ratio_id=image_inputs["aspect_ratio_ids"],
+                aspect_ratio_mask=image_inputs["aspect_ratio_mask"],
+                modality=Modality.IMAGE,
+            )
+        ]
 
         return image_inputs
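
For context on the fields this `MultimodalDataItem` carries: Mllama's processor splits each image into tiles and records which tile layout it chose, and the cross-attention vision path needs that layout alongside the pixels. A toy sketch of what those tensors encode (shapes, the 448-pixel tile size, and the layout-id value are assumptions for illustration, not taken from this diff):

```python
import torch

# Toy Mllama-style tile bookkeeping: one image split into a 2x2 grid,
# with a maximum of 4 tiles per image (all values are assumptions).
max_tiles = 4
pixel_values = torch.zeros(1, 1, max_tiles, 3, 448, 448)  # (batch, images, tiles, C, H, W)
aspect_ratio_ids = torch.tensor([[6]])  # index of the chosen (2, 2) layout in the ratio table
aspect_ratio_mask = torch.ones(1, 1, max_tiles, dtype=torch.long)  # all 4 tile slots are real
```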

sglang/srt/managers/multimodal_processors/mllama4.py (new file)
@@ -0,0 +1,161 @@
+from typing import List, Mapping, Optional, Tuple, Union
+
+import torch
+from PIL import Image
+from transformers import Llama4Processor
+from transformers.image_utils import SizeDict
+from transformers.models.llama4.image_processing_llama4 import (
+    find_supported_resolutions,
+    get_best_fit,
+)
+
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
+from sglang.srt.utils import load_image
+
+
+class Mllama4ImageProcessor(BaseMultimodalProcessor):
+    models = [Llama4ForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+        self.vision_config = hf_config.vision_config
+        self.text_config = hf_config.text_config
+        self.multimodal_tokens = MultimodalSpecialTokens(
+            image_token=_processor.image_token
+        )
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        max_req_input_len=None,
+        *args,
+        **kwargs,
+    ):
+        if not image_data:
+            return None
+
+        if isinstance(input_text, list):
+            assert len(input_text) and isinstance(input_text[0], int)
+            input_text = self._processor.tokenizer.decode(input_text)
+
+        # Process images and text using the base processor's load_mm_data method
+        processed_data = self.load_mm_data(
+            prompt=input_text,
+            multimodal_tokens=self.multimodal_tokens,
+            max_req_input_len=max_req_input_len or 4096,
+            image_data=image_data,
+            return_text=True,
+        )
+
+        # Process the images using the processor
+        processor = Llama4Processor.from_pretrained(
+            self.server_args.model_path, **kwargs
+        )
+
+        # Process the prompt and images
+        image_inputs = processor(
+            text=processed_data.input_text,
+            images=processed_data.images,
+            return_tensors="pt",
+        )
+
+        # Handle image resolutions and aspect ratios
+        if "pixel_values" in image_inputs:
+            image_processor = processor.image_processor
+            tokenizer = self._processor.tokenizer
+
+            # Calculate tile size and find supported resolutions
+            tile_size = self.vision_config.image_size
+            max_num_tiles = getattr(self.vision_config, "max_patches", 1)
+
+            possible_resolutions = find_supported_resolutions(
+                max_num_chunks=max_num_tiles,
+                patch_size=SizeDict(height=tile_size, width=tile_size),
+            )
+
+            # Find best fit for each image
+            best_fit_sizes = [
+                get_best_fit(
+                    (image.size[1], image.size[0]),  # (height, width)
+                    torch.tensor(possible_resolutions),
+                    resize_to_max_canvas=image_processor.resize_to_max_canvas,
+                )
+                for image in processed_data.images
+            ]
+
+            # Calculate aspect ratios and patches per image
+            aspect_ratios = [
+                (image_size[0] // tile_size, image_size[1] // tile_size)
+                for image_size in best_fit_sizes
+            ]
+
+            patches_per_image = [
+                1 if r_h * r_w == 1 else 1 + r_h * r_w for (r_h, r_w) in aspect_ratios
+            ]
+
+            # Add to image_inputs
+            image_inputs["aspect_ratios"] = aspect_ratios
+            image_inputs["patches_per_image"] = torch.tensor(patches_per_image)
+
+            # Process embed_is_patch
+            vocab = tokenizer.get_vocab()
+            patch_id = vocab.get(processor.img_patch_token, -1)
+            image_end_id = vocab.get(processor.end_of_img_token, -1)
+
+            if patch_id != -1 and image_end_id != -1:
+                input_ids = image_inputs["input_ids"].view(-1)
+
+                # Remove BOS token if present
+                if input_ids.size(0) > 0 and input_ids[0] == tokenizer.bos_token_id:
+                    input_ids = input_ids[1:]
+
+                # Find image end indices and split input_ids
+                image_end_indices = (input_ids == image_end_id).nonzero().view(-1)
+
+                if image_end_indices.size(0) > 0:
+                    # Split at image boundaries
+                    split_indices = (image_end_indices + 1)[:-1]
+                    split_input_ids = torch.tensor_split(input_ids, split_indices)
+                    split_input_ids = [x for x in split_input_ids if x.numel() > 0]
+
+                    # Create embed_is_patch for each image
+                    embed_is_patch = []
+                    for per_image_input_ids in split_input_ids:
+                        embed_is_patch.append(per_image_input_ids == patch_id)
+
+                    image_inputs["embed_is_patch"] = embed_is_patch
+
+        # Convert to the format expected by SGLang
+        image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
+
+        # Add metadata for image processing
+        image_inputs["mm_items"] = [
+            MultimodalDataItem(
+                pixel_values=image_inputs["pixel_values"],
+                modality=Modality.IMAGE,
+                # Add additional metadata needed for Llama4 vision processing
+                embed_is_patch=image_inputs.get("embed_is_patch", None),
+                aspect_ratios=image_inputs.get("aspect_ratios", None),
+                patches_per_image=image_inputs.get("patches_per_image", None),
+            )
+        ]
+
+        return image_inputs
+
+    def get_patch_per_chunk(self):
+        """Calculate patches per chunk based on vision config"""
+        image_size = self.vision_config.image_size
+        patch_size = self.vision_config.patch_size
+
+        assert (
+            image_size % patch_size == 0
+        ), f"chunk size {image_size} should be multiple of patch_size {patch_size}"
+
+        ds_ratio = int(round(1.0 / (self.vision_config.pixel_shuffle_ratio**2)))
+        return (image_size // patch_size) ** 2 // ds_ratio
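
Two pieces of arithmetic in this new file are worth sanity-checking by hand: `patches_per_image` adds one global thumbnail tile whenever an image is split (`1 + r_h * r_w`), and `get_patch_per_chunk` divides the per-tile ViT patch count by the pixel-shuffle downsampling ratio. A hedged worked example with plausible config values (336, 14, and 0.5 are assumptions for illustration, not read from any checkpoint):

```python
# Worked example of the two formulas above, with assumed config values.
image_size = 336           # tile/chunk edge in pixels (assumed)
patch_size = 14            # ViT patch edge in pixels (assumed)
pixel_shuffle_ratio = 0.5  # each spatial dim is halved by pixel shuffle (assumed)

# patches_per_image: a 2x3 tile grid gets one extra global thumbnail tile.
r_h, r_w = 2, 3
tiles = 1 if r_h * r_w == 1 else 1 + r_h * r_w
print(tiles)  # 7

# get_patch_per_chunk: vision tokens per tile after pixel shuffle.
ds_ratio = int(round(1.0 / (pixel_shuffle_ratio**2)))  # 1 / 0.25 = 4
print((image_size // patch_size) ** 2 // ds_ratio)     # 24*24 // 4 = 144
```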

sglang/srt/managers/multimodal_processors/qwen_vl.py
@@ -1,18 +1,17 @@
 import asyncio
 import math
-import time
 from typing import List, Union
 
 import torch
 from PIL import Image
 
-from sglang.srt.managers.multimodal_processor import (
+from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
 from sglang.srt.managers.multimodal_processors.base_processor import (
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
 
@@ -34,45 +33,15 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         self.MAX_PIXELS = 16384 * 28 * 28
         self.MAX_RATIO = 200
 
-    @staticmethod
-    def _process_images_task(images, input_text, _hf_config):
-        if isinstance(images, list) and len(images) == 0:
-            images = None
-        result = get_global_processor().__call__(
-            text=[input_text], images=images, padding=True, return_tensors="pt"
-        )
-
-        return {
-            "input_ids": result.input_ids,
-            "pixel_values": getattr(result, "pixel_values", None),
-            "image_grid_thw": getattr(result, "image_grid_thw", None),
-            "second_per_grid_ts": getattr(result, "second_per_grid_ts", None),
-            "video_grid_thws": getattr(result, "video_grid_thws", None),
-        }
-
-    async def _process_single_image(self, images, input_text) -> dict:
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            return await loop.run_in_executor(
-                self.executor,
-                Qwen2_5VLImageProcessor._process_images_task,
-                images,
-                input_text,
-                self.hf_config,
-            )
-        else:
-            return self._process_images_task(images, input_text, self.hf_config)
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
-        input_ids,
+        prompt,
         request_obj,
         max_req_input_len,
         *args,
         **kwargs,
     ):
-        start = time.time()
         if not image_data:
             return None
         if isinstance(image_data, str):
@@ -80,7 +49,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
         image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
-            input_ids=input_ids,
+            prompt=prompt,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
             max_req_input_len=max_req_input_len,
@@ -144,24 +113,32 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
             """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
             return math.floor(number / factor) * factor
 
-        images = [resize_image(image) for image in base_output.images]
+        async def resize_image_async(image):
+            return resize_image(image)
 
-        ret = await self._process_single_image(
-            images=images, input_text=base_output.input_text
+        resize_tasks = [resize_image_async(image) for image in base_output.images]
+        resized_images = await asyncio.gather(*resize_tasks)
+
+        ret = self.process_mm_data(
+            input_text=base_output.input_text,
+            images=resized_images,
         )
 
         image_grid_thws = torch.concat([ret["image_grid_thw"]])
-        video_grid_thws = None
         return {
             "input_ids": ret["input_ids"].flatten().tolist(),
-            "pixel_values": ret["pixel_values"],
-            "data_hashes": base_output.mm_data_hashes,
-            "modalities": request_obj.modalities or ["image"],
-            "image_grid_thws": image_grid_thws,
-            "video_grid_thws": video_grid_thws,
+            "mm_items": [
+                MultimodalDataItem(
+                    pixel_values=ret["pixel_values"],
+                    image_grid_thws=image_grid_thws,
+                    # TODO
+                    video_grid_thws=None,
+                    second_per_grid_ts=ret.get("second_per_grid_ts", None),
+                    modality=Modality.IMAGE,
+                )
+            ],
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
             "im_token_id": self.image_token_id,
             "video_token_id": self.video_token_id,
-            "second_per_grid_ts": ret["second_per_grid_ts"],
         }
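
The `image_grid_thw` tensor carried by the new `MultimodalDataItem` drives the downstream token accounting: each image contributes a `(t, h, w)` patch grid, and after spatial merging the prompt must contain `t*h*w / merge_size**2` image placeholder tokens. A hedged arithmetic check (merge size 2 is the usual Qwen2-VL default to my knowledge; treat all numbers here as assumptions):

```python
import torch

# One (t, h, w) grid per image, shaped like ret["image_grid_thw"]:
# a 504x728 image at 14px patches gives a 36x52 grid.
image_grid_thw = torch.tensor([[1, 36, 52]])

merge_size = 2  # spatial merge factor (assumed Qwen2-VL default)
tokens_per_image = image_grid_thw.prod(dim=-1) // merge_size**2
print(tokens_per_image)  # tensor([468]) placeholder tokens for this image
```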