sglang 0.4.4.post2__py3-none-any.whl → 0.4.4.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. sglang/bench_serving.py +72 -10
  2. sglang/srt/_custom_ops.py +59 -92
  3. sglang/srt/configs/deepseekvl2.py +10 -1
  4. sglang/srt/configs/model_config.py +6 -16
  5. sglang/srt/constrained/base_grammar_backend.py +5 -1
  6. sglang/srt/custom_op.py +5 -0
  7. sglang/srt/distributed/device_communicators/custom_all_reduce.py +28 -80
  8. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  9. sglang/srt/distributed/parallel_state.py +32 -5
  10. sglang/srt/entrypoints/engine.py +0 -5
  11. sglang/srt/entrypoints/http_server.py +7 -1
  12. sglang/srt/entrypoints/verl_engine.py +2 -0
  13. sglang/srt/function_call_parser.py +0 -1
  14. sglang/srt/layers/attention/flashattention_backend.py +582 -125
  15. sglang/srt/layers/attention/flashinfer_backend.py +5 -7
  16. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
  17. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  18. sglang/srt/layers/dp_attention.py +12 -1
  19. sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
  20. sglang/srt/layers/moe/ep_moe/layer.py +79 -80
  21. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +403 -47
  26. sglang/srt/layers/moe/topk.py +79 -6
  27. sglang/srt/layers/quantization/__init__.py +137 -165
  28. sglang/srt/layers/quantization/awq.py +200 -0
  29. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
  30. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
  31. sglang/srt/layers/quantization/fp8_kernel.py +2 -1
  32. sglang/srt/layers/quantization/fp8_utils.py +1 -4
  33. sglang/srt/layers/quantization/gptq.py +30 -40
  34. sglang/srt/layers/quantization/moe_wna16.py +501 -0
  35. sglang/srt/layers/quantization/utils.py +1 -1
  36. sglang/srt/layers/quantization/w8a8_fp8.py +1 -1
  37. sglang/srt/lora/backend/base_backend.py +4 -4
  38. sglang/srt/lora/backend/flashinfer_backend.py +12 -9
  39. sglang/srt/lora/backend/triton_backend.py +5 -8
  40. sglang/srt/lora/layers.py +19 -33
  41. sglang/srt/lora/lora_manager.py +20 -7
  42. sglang/srt/lora/mem_pool.py +12 -6
  43. sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
  44. sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
  45. sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
  46. sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
  47. sglang/srt/lora/utils.py +6 -0
  48. sglang/srt/managers/cache_controller.py +34 -11
  49. sglang/srt/managers/io_struct.py +4 -2
  50. sglang/srt/managers/mm_utils.py +202 -156
  51. sglang/srt/managers/multimodal_processor.py +0 -2
  52. sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
  53. sglang/srt/managers/multimodal_processors/clip.py +44 -0
  54. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
  55. sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
  56. sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
  57. sglang/srt/managers/multimodal_processors/llava.py +34 -14
  58. sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
  59. sglang/srt/managers/multimodal_processors/mlama.py +10 -23
  60. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
  61. sglang/srt/managers/schedule_batch.py +185 -127
  62. sglang/srt/managers/scheduler.py +29 -23
  63. sglang/srt/managers/tokenizer_manager.py +1 -2
  64. sglang/srt/managers/tp_worker.py +3 -0
  65. sglang/srt/managers/utils.py +1 -6
  66. sglang/srt/mem_cache/hiradix_cache.py +62 -52
  67. sglang/srt/mem_cache/memory_pool.py +72 -6
  68. sglang/srt/mem_cache/paged_allocator.py +39 -0
  69. sglang/srt/metrics/collector.py +23 -53
  70. sglang/srt/model_executor/cuda_graph_runner.py +16 -13
  71. sglang/srt/model_executor/forward_batch_info.py +10 -10
  72. sglang/srt/model_executor/model_runner.py +64 -59
  73. sglang/srt/model_loader/loader.py +19 -1
  74. sglang/srt/model_loader/weight_utils.py +6 -3
  75. sglang/srt/models/clip.py +568 -0
  76. sglang/srt/models/deepseek_janus_pro.py +12 -17
  77. sglang/srt/models/deepseek_v2.py +339 -123
  78. sglang/srt/models/deepseek_vl2.py +105 -104
  79. sglang/srt/models/gemma3_causal.py +12 -2
  80. sglang/srt/models/gemma3_mm.py +20 -80
  81. sglang/srt/models/llama.py +4 -1
  82. sglang/srt/models/llava.py +31 -19
  83. sglang/srt/models/llavavid.py +16 -7
  84. sglang/srt/models/minicpmo.py +63 -147
  85. sglang/srt/models/minicpmv.py +17 -27
  86. sglang/srt/models/mllama.py +29 -14
  87. sglang/srt/models/qwen2.py +9 -6
  88. sglang/srt/models/qwen2_5_vl.py +21 -31
  89. sglang/srt/models/qwen2_vl.py +20 -21
  90. sglang/srt/openai_api/adapter.py +106 -93
  91. sglang/srt/openai_api/protocol.py +10 -5
  92. sglang/srt/patch_torch.py +71 -0
  93. sglang/srt/platforms/interface.py +371 -0
  94. sglang/srt/server_args.py +120 -25
  95. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
  96. sglang/srt/speculative/eagle_utils.py +140 -28
  97. sglang/srt/speculative/eagle_worker.py +94 -25
  98. sglang/srt/utils.py +137 -51
  99. sglang/test/runners.py +27 -2
  100. sglang/test/test_custom_ops.py +55 -0
  101. sglang/test/test_utils.py +14 -27
  102. sglang/utils.py +2 -2
  103. sglang/version.py +1 -1
  104. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/METADATA +10 -5
  105. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/RECORD +108 -99
  106. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/WHEEL +0 -0
  107. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/licenses/LICENSE +0 -0
  108. {sglang-0.4.4.post2.dist-info → sglang-0.4.4.post4.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multimodal_processors/minicpm.py
@@ -1,13 +1,13 @@
-import asyncio
 from typing import List, Union
 
 import torch
+from transformers import BaseImageProcessorFast
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.minicpmo import MiniCPMO
 from sglang.srt.models.minicpmv import MiniCPMV
 
@@ -21,19 +21,23 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         self.image_token = "(<image>./</image>)"
         self.audio_token = "(<audio>./</audio>)"
 
-    @staticmethod
-    def _process_data_task(input_text, images=None, audios=None):
+    def process_data_task(self, input_text, images=None, audios=None):
 
         if isinstance(images, list) and len(images) == 0:
             images = None
         if isinstance(audios, list) and len(audios) == 0:
             audios = None
-        result = get_global_processor().__call__(
+        processor = self._processor
+        args = {}
+        if isinstance(processor, BaseImageProcessorFast):
+            args["device"] = "cuda"
+        result = self._processor.__call__(
             text=input_text,
             images=images,
             audios=audios,
             return_tensors="pt",
             chunk_input=True,
+            **args,
         )
         return {
             "input_ids": result.input_ids,
@@ -44,23 +48,6 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             "audio_bounds": getattr(result, "audio_bounds", None),
         }
 
-    async def _process_data(self, images, input_text, audios=None):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            multimodal_data_inputs = await loop.run_in_executor(
-                self.executor,
-                MiniCPMMultimodalProcessor._process_data_task,
-                input_text,
-                images,
-                audios,
-            )
-        else:
-            multimodal_data_inputs = self._processor(
-                images=images, text=input_text, audios=audios, return_tensors="pt"
-            )
-
-        return multimodal_data_inputs
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
@@ -77,7 +64,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             audio_data = [audio_data]
 
         base_output = self.load_mm_data(
-            input_ids=input_ids,
+            prompt=input_ids,
             max_req_input_len=max_req_input_len,
             audio_data=audio_data,
             image_data=image_data,
@@ -88,9 +75,9 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         if base_output is None:
             return None
 
-        res = await self._process_data(
-            images=base_output.images,
+        res = self.process_mm_data(
             input_text=base_output.input_text,
+            images=base_output.images,
             audios=base_output.audios,
         )
 
@@ -142,23 +129,33 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             tgt_sizes_flat += [tgt_n]
 
         pixel_values = pixel_values_flat
-        if len(tgt_sizes_flat) == 0:
-            tgt_sizes = None
-        else:
-            tgt_sizes = torch.stack(tgt_sizes_flat)
-        if not isinstance(res["audio_features"], list):
-            res["audio_features"] = [res["audio_features"]]
+
+        items = []
+        if len(pixel_values) != 0:
+            item = MultimodalDataItem(
+                pixel_values=pixel_values,
+                tgt_size=tgt_sizes_flat,
+                modality=Modality.IMAGE,
+            )
+            items += [item]
+
+        if (
+            "audio_features" in res
+            and res["audio_features"] is not None
+            and len(res["audio_features"]) != 0
+        ):
+            item = MultimodalDataItem(
+                audio_features=[res["audio_features"]],
+                audio_feature_lens=res["audio_feature_lens"],
+                modality=Modality.AUDIO,
+            )
+            items += [item]
+
         return {
+            "mm_items": items,
             "input_ids": res["input_ids"].flatten().tolist(),
-            "pixel_values": pixel_values,
-            "tgt_sizes": tgt_sizes,
-            "data_hashes": base_output.mm_data_hashes,
-            "modalities": request_obj.modalities or ["image"],
             "audio_start_id": audio_start_id,
             "audio_end_id": audio_end_id,
-            "audio_features": res["audio_features"],
-            "audio_bounds": res["audio_bounds"],
-            "audio_feature_lens": res["audio_feature_lens"],
             "im_token_id": im_token_id,
             "im_start_id": tokenizer.im_start_id,
             "im_end_id": tokenizer.im_end_id,
sglang/srt/managers/multimodal_processors/mlama.py
@@ -1,10 +1,9 @@
-import asyncio
 from typing import List, Union
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama import MllamaForConditionalGeneration
 from sglang.srt.utils import load_image
 
@@ -15,25 +14,6 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
 
-    @staticmethod
-    def _process_single_image_task(images, input_text):
-        # input_ids', 'attention_mask', 'pixel_values', 'aspect_ratio_ids', 'aspect_ratio_mask', 'cross_attention_mask'
-        return get_global_processor()(images, input_text, return_tensors="pt")
-
-    async def _process_single_image(self, images, input_text):
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            image_inputs = await loop.run_in_executor(
-                self.executor,
-                MllamaImageProcessor._process_single_image_task,
-                images,
-                input_text,
-            )
-        else:
-            image_inputs = self._processor(images, input_text, return_tensors="pt")
-
-        return image_inputs
-
     async def process_mm_data_async(
         self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
     ):
@@ -52,8 +32,15 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
         else:
             images = load_image(image_data[0])[0]
 
-        image_inputs = await self._process_single_image(images, input_text)
-        image_inputs["data_hashes"] = [hash(str(image_data))]
+        image_inputs = self.process_mm_data(input_text=input_text, images=images)
         image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
+        image_inputs["mm_items"] = [
+            MultimodalDataItem(
+                pixel_values=image_inputs["pixel_values"],
+                aspect_ratio_id=image_inputs["aspect_ratio_ids"],
+                aspect_ratio_mask=image_inputs["aspect_ratio_mask"],
+                modality=Modality.IMAGE,
+            )
+        ]
 
         return image_inputs
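
Note: both processors above now call self.process_mm_data, a helper on BaseMultimodalProcessor (base_processor.py changes +45 -77 in this release), in place of the deleted per-class executor plumbing. The sketch below reconstructs that helper from its call sites and from the BaseImageProcessorFast branch in the MiniCPM diff; it is an assumption, not the verbatim implementation.

# Assumed shape of BaseMultimodalProcessor.process_mm_data, inferred from
# call sites such as process_mm_data(input_text=..., images=..., audios=...).
from transformers import BaseImageProcessorFast

class BaseMultimodalProcessor:
    def __init__(self, hf_config, server_args, _processor):
        self._processor = _processor

    def process_mm_data(self, input_text, images=None, audios=None, **kwargs):
        if isinstance(images, list) and len(images) == 0:
            images = None
        if isinstance(audios, list) and len(audios) == 0:
            audios = None
        if isinstance(self._processor, BaseImageProcessorFast):
            # Fast (torch-backed) HF image processors can preprocess on GPU.
            kwargs["device"] = "cuda"
        call_kwargs = dict(text=input_text, images=images,
                           return_tensors="pt", **kwargs)
        if audios is not None:
            # Only audio-capable processors (e.g. MiniCPM-o) accept this.
            call_kwargs["audios"] = audios
        return self._processor(**call_kwargs)
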
sglang/srt/managers/multimodal_processors/qwen_vl.py
@@ -1,18 +1,17 @@
 import asyncio
 import math
-import time
 from typing import List, Union
 
 import torch
 from PIL import Image
 
-from sglang.srt.managers.multimodal_processor import (
+from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
 from sglang.srt.managers.multimodal_processors.base_processor import (
     MultimodalSpecialTokens,
-    get_global_processor,
 )
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
 
@@ -34,45 +33,15 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         self.MAX_PIXELS = 16384 * 28 * 28
         self.MAX_RATIO = 200
 
-    @staticmethod
-    def _process_images_task(images, input_text, _hf_config):
-        if isinstance(images, list) and len(images) == 0:
-            images = None
-        result = get_global_processor().__call__(
-            text=[input_text], images=images, padding=True, return_tensors="pt"
-        )
-
-        return {
-            "input_ids": result.input_ids,
-            "pixel_values": getattr(result, "pixel_values", None),
-            "image_grid_thw": getattr(result, "image_grid_thw", None),
-            "second_per_grid_ts": getattr(result, "second_per_grid_ts", None),
-            "video_grid_thws": getattr(result, "video_grid_thws", None),
-        }
-
-    async def _process_single_image(self, images, input_text) -> dict:
-        if self.executor is not None:
-            loop = asyncio.get_event_loop()
-            return await loop.run_in_executor(
-                self.executor,
-                Qwen2_5VLImageProcessor._process_images_task,
-                images,
-                input_text,
-                self.hf_config,
-            )
-        else:
-            return self._process_images_task(images, input_text, self.hf_config)
-
     async def process_mm_data_async(
         self,
         image_data: List[Union[str, bytes]],
-        input_ids,
+        prompt,
         request_obj,
         max_req_input_len,
         *args,
         **kwargs,
     ):
-        start = time.time()
         if not image_data:
             return None
         if isinstance(image_data, str):
@@ -80,7 +49,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
 
         image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
-            input_ids=input_ids,
+            prompt=prompt,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
             max_req_input_len=max_req_input_len,
@@ -144,24 +113,32 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
             """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
             return math.floor(number / factor) * factor
 
-        images = [resize_image(image) for image in base_output.images]
+        async def resize_image_async(image):
+            return resize_image(image)
 
-        ret = await self._process_single_image(
-            images=images, input_text=base_output.input_text
+        resize_tasks = [resize_image_async(image) for image in base_output.images]
+        resized_images = await asyncio.gather(*resize_tasks)
+
+        ret = self.process_mm_data(
+            input_text=base_output.input_text,
+            images=resized_images,
         )
 
         image_grid_thws = torch.concat([ret["image_grid_thw"]])
-        video_grid_thws = None
         return {
             "input_ids": ret["input_ids"].flatten().tolist(),
-            "pixel_values": ret["pixel_values"],
-            "data_hashes": base_output.mm_data_hashes,
-            "modalities": request_obj.modalities or ["image"],
-            "image_grid_thws": image_grid_thws,
-            "video_grid_thws": video_grid_thws,
+            "mm_items": [
+                MultimodalDataItem(
+                    pixel_values=ret["pixel_values"],
+                    image_grid_thws=image_grid_thws,
+                    # TODO
+                    video_grid_thws=None,
+                    second_per_grid_ts=ret.get("second_per_grid_ts", None),
+                    modality=Modality.IMAGE,
+                )
+            ],
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
            "im_token_id": self.image_token_id,
             "video_token_id": self.video_token_id,
-            "second_per_grid_ts": ret["second_per_grid_ts"],
         }
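
Note: resize_image_async above is an async wrapper around a synchronous resize, so the asyncio.gather fan-out preserves image order but does not by itself overlap the CPU-bound work. A minimal sketch of the same fan-out using asyncio.to_thread (Python 3.9+) as one way to get real overlap; resize_image here is a placeholder for the MAX_PIXELS/MAX_RATIO clamping helper in the diff.

import asyncio
from PIL import Image

def resize_image(image: Image.Image) -> Image.Image:
    # Placeholder for the size-clamping logic shown in the diff.
    return image

async def resize_all(images: list) -> list:
    # One task per image; to_thread moves each synchronous resize onto a
    # worker thread, so the resizes can actually run concurrently.
    return await asyncio.gather(
        *(asyncio.to_thread(resize_image, img) for img in images)
    )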