sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. sglang/bench_one_batch.py +2 -1
  2. sglang/eval/loogle_eval.py +7 -0
  3. sglang/srt/configs/deepseekvl2.py +11 -2
  4. sglang/srt/configs/internvl.py +3 -0
  5. sglang/srt/configs/janus_pro.py +3 -0
  6. sglang/srt/configs/model_config.py +9 -7
  7. sglang/srt/configs/update_config.py +3 -1
  8. sglang/srt/conversation.py +1 -0
  9. sglang/srt/custom_op.py +5 -2
  10. sglang/srt/disaggregation/decode.py +9 -1
  11. sglang/srt/disaggregation/mooncake/conn.py +44 -56
  12. sglang/srt/distributed/parallel_state.py +33 -0
  13. sglang/srt/entrypoints/engine.py +30 -26
  14. sglang/srt/entrypoints/openai/serving_chat.py +21 -2
  15. sglang/srt/eplb/expert_location_dispatch.py +1 -1
  16. sglang/srt/function_call/function_call_parser.py +2 -0
  17. sglang/srt/function_call/qwen3_detector.py +150 -0
  18. sglang/srt/hf_transformers_utils.py +0 -1
  19. sglang/srt/layers/activation.py +13 -0
  20. sglang/srt/layers/attention/flashattention_backend.py +3 -3
  21. sglang/srt/layers/attention/flashinfer_backend.py +40 -1
  22. sglang/srt/layers/linear.py +13 -102
  23. sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
  24. sglang/srt/layers/moe/ep_moe/layer.py +23 -402
  25. sglang/srt/layers/moe/fused_moe_native.py +7 -47
  26. sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
  27. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +35 -45
  33. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
  34. sglang/srt/layers/moe/topk.py +187 -12
  35. sglang/srt/layers/quantization/__init__.py +20 -134
  36. sglang/srt/layers/quantization/awq.py +578 -11
  37. sglang/srt/layers/quantization/awq_triton.py +339 -0
  38. sglang/srt/layers/quantization/base_config.py +85 -10
  39. sglang/srt/layers/quantization/blockwise_int8.py +17 -55
  40. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
  41. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +24 -73
  42. sglang/srt/layers/quantization/fp8.py +273 -62
  43. sglang/srt/layers/quantization/fp8_kernel.py +210 -46
  44. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  45. sglang/srt/layers/quantization/gptq.py +501 -143
  46. sglang/srt/layers/quantization/marlin_utils.py +790 -0
  47. sglang/srt/layers/quantization/modelopt_quant.py +26 -108
  48. sglang/srt/layers/quantization/moe_wna16.py +45 -49
  49. sglang/srt/layers/quantization/petit.py +252 -0
  50. sglang/srt/layers/quantization/petit_utils.py +104 -0
  51. sglang/srt/layers/quantization/qoq.py +7 -6
  52. sglang/srt/layers/quantization/scalar_type.py +352 -0
  53. sglang/srt/layers/quantization/unquant.py +422 -0
  54. sglang/srt/layers/quantization/utils.py +343 -3
  55. sglang/srt/layers/quantization/w4afp8.py +8 -4
  56. sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
  57. sglang/srt/layers/quantization/w8a8_int8.py +51 -115
  58. sglang/srt/layers/vocab_parallel_embedding.py +1 -41
  59. sglang/srt/lora/lora.py +0 -4
  60. sglang/srt/lora/lora_manager.py +87 -53
  61. sglang/srt/lora/mem_pool.py +81 -33
  62. sglang/srt/lora/utils.py +12 -5
  63. sglang/srt/managers/cache_controller.py +241 -0
  64. sglang/srt/managers/io_struct.py +41 -29
  65. sglang/srt/managers/mm_utils.py +7 -8
  66. sglang/srt/managers/schedule_batch.py +150 -110
  67. sglang/srt/managers/schedule_policy.py +68 -27
  68. sglang/srt/managers/scheduler.py +243 -61
  69. sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
  70. sglang/srt/managers/tokenizer_manager.py +11 -3
  71. sglang/srt/managers/tp_worker.py +14 -0
  72. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  73. sglang/srt/mem_cache/allocator.py +7 -16
  74. sglang/srt/mem_cache/base_prefix_cache.py +14 -2
  75. sglang/srt/mem_cache/chunk_cache.py +5 -2
  76. sglang/srt/mem_cache/hicache_storage.py +152 -0
  77. sglang/srt/mem_cache/hiradix_cache.py +179 -4
  78. sglang/srt/mem_cache/memory_pool.py +16 -1
  79. sglang/srt/mem_cache/memory_pool_host.py +41 -2
  80. sglang/srt/mem_cache/radix_cache.py +26 -0
  81. sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
  82. sglang/srt/metrics/collector.py +9 -0
  83. sglang/srt/model_executor/cuda_graph_runner.py +5 -6
  84. sglang/srt/model_executor/forward_batch_info.py +14 -1
  85. sglang/srt/model_executor/model_runner.py +109 -22
  86. sglang/srt/model_loader/loader.py +7 -1
  87. sglang/srt/model_loader/utils.py +4 -4
  88. sglang/srt/models/clip.py +1 -1
  89. sglang/srt/models/deepseek.py +9 -6
  90. sglang/srt/models/deepseek_janus_pro.py +1 -1
  91. sglang/srt/models/deepseek_v2.py +191 -171
  92. sglang/srt/models/deepseek_vl2.py +5 -5
  93. sglang/srt/models/gemma.py +48 -0
  94. sglang/srt/models/gemma2.py +52 -0
  95. sglang/srt/models/gemma3_causal.py +63 -0
  96. sglang/srt/models/gemma3_mm.py +1 -1
  97. sglang/srt/models/gemma3n_mm.py +2 -4
  98. sglang/srt/models/granitemoe.py +385 -0
  99. sglang/srt/models/grok.py +9 -3
  100. sglang/srt/models/hunyuan.py +63 -16
  101. sglang/srt/models/internvl.py +1 -1
  102. sglang/srt/models/kimi_vl.py +1 -1
  103. sglang/srt/models/llama.py +41 -0
  104. sglang/srt/models/llama4.py +11 -11
  105. sglang/srt/models/llava.py +2 -2
  106. sglang/srt/models/llavavid.py +1 -1
  107. sglang/srt/models/minicpm.py +0 -2
  108. sglang/srt/models/minicpmo.py +3 -7
  109. sglang/srt/models/minicpmv.py +1 -1
  110. sglang/srt/models/mistral.py +1 -1
  111. sglang/srt/models/mixtral.py +9 -2
  112. sglang/srt/models/mllama.py +3 -5
  113. sglang/srt/models/mllama4.py +3 -3
  114. sglang/srt/models/olmoe.py +8 -5
  115. sglang/srt/models/persimmon.py +330 -0
  116. sglang/srt/models/phi.py +321 -0
  117. sglang/srt/models/phi4mm.py +44 -4
  118. sglang/srt/models/phi4mm_audio.py +1260 -0
  119. sglang/srt/models/phi4mm_utils.py +1917 -0
  120. sglang/srt/models/phimoe.py +9 -3
  121. sglang/srt/models/qwen.py +37 -0
  122. sglang/srt/models/qwen2.py +41 -0
  123. sglang/srt/models/qwen2_5_vl.py +4 -4
  124. sglang/srt/models/qwen2_audio.py +1 -1
  125. sglang/srt/models/qwen2_moe.py +53 -5
  126. sglang/srt/models/qwen2_vl.py +4 -4
  127. sglang/srt/models/qwen3.py +65 -1
  128. sglang/srt/models/qwen3_moe.py +56 -18
  129. sglang/srt/models/vila.py +1 -1
  130. sglang/srt/multimodal/processors/base_processor.py +91 -97
  131. sglang/srt/multimodal/processors/clip.py +21 -19
  132. sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
  133. sglang/srt/multimodal/processors/gemma3.py +13 -17
  134. sglang/srt/multimodal/processors/gemma3n.py +19 -23
  135. sglang/srt/multimodal/processors/internvl.py +9 -10
  136. sglang/srt/multimodal/processors/janus_pro.py +12 -27
  137. sglang/srt/multimodal/processors/kimi_vl.py +12 -14
  138. sglang/srt/multimodal/processors/llava.py +4 -2
  139. sglang/srt/multimodal/processors/minicpm.py +35 -44
  140. sglang/srt/multimodal/processors/mlama.py +21 -18
  141. sglang/srt/multimodal/processors/mllama4.py +4 -5
  142. sglang/srt/multimodal/processors/phi4mm.py +63 -39
  143. sglang/srt/multimodal/processors/pixtral.py +14 -35
  144. sglang/srt/multimodal/processors/qwen_audio.py +65 -0
  145. sglang/srt/multimodal/processors/qwen_vl.py +16 -21
  146. sglang/srt/multimodal/processors/vila.py +14 -14
  147. sglang/srt/sampling/sampling_params.py +8 -1
  148. sglang/srt/server_args.py +393 -230
  149. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +9 -1
  150. sglang/srt/two_batch_overlap.py +1 -0
  151. sglang/srt/utils.py +27 -1
  152. sglang/test/runners.py +14 -3
  153. sglang/test/test_block_fp8.py +8 -3
  154. sglang/test/test_block_fp8_ep.py +1 -1
  155. sglang/test/test_custom_ops.py +12 -7
  156. sglang/test/test_cutlass_w4a8_moe.py +1 -3
  157. sglang/test/test_fp4_moe.py +1 -3
  158. sglang/test/test_marlin_moe.py +286 -0
  159. sglang/test/test_marlin_utils.py +171 -0
  160. sglang/test/test_utils.py +35 -0
  161. sglang/version.py +1 -1
  162. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/METADATA +8 -8
  163. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/RECORD +166 -146
  164. sglang/srt/layers/quantization/quant_utils.py +0 -166
  165. sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
  166. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/WHEEL +0 -0
  167. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/licenses/LICENSE +0 -0
  168. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/top_level.txt +0 -0
@@ -1,166 +0,0 @@
1
- # SPDX-License-Identifier: Apache-2.0
2
- # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/quant_utils.py
3
-
4
- from typing import Optional
5
-
6
- import numpy
7
- import torch
8
- from sgl_kernel.scalar_type import ScalarType
9
-
10
-
11
def get_pack_factor(num_bits):
    """Return how many ``num_bits``-wide values fit into one 32-bit word.

    Args:
        num_bits: width in bits of a single packed value.

    Returns:
        ``32 // num_bits``.

    Raises:
        AssertionError: if ``num_bits`` is not a positive divisor of 32.
    """
    # Positivity is checked first so that 0 fails with the intended message
    # instead of a ZeroDivisionError from the modulo, and so that negative
    # divisors of 32 (e.g. -4) no longer yield a negative pack factor.
    assert num_bits > 0 and 32 % num_bits == 0, f"Unsupported num_bits = {num_bits}"
    return 32 // num_bits
14
-
15
-
16
def pack_cols(
    q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    """Pack a ``(size_k, size_n)`` tensor of small integers along its columns.

    Each run of ``pack_factor = 32 // num_bits`` neighbouring columns is merged
    into one 32-bit word; within a run, column ``i`` is placed at bit offset
    ``num_bits * i``. The packing itself happens on the CPU through NumPy.

    Returns:
        A contiguous int32 tensor of shape ``(size_k, size_n // pack_factor)``
        moved back to the device ``q_w`` was on.
    """
    assert q_w.shape == (size_k, size_n)

    pack_factor = get_pack_factor(num_bits)
    assert size_n % pack_factor == 0

    source_device = q_w.device
    unpacked = q_w.cpu().numpy().astype(numpy.uint32)
    packed = numpy.zeros((size_k, size_n // pack_factor), dtype=numpy.uint32)

    for slot in range(pack_factor):
        # Every pack_factor-th column, starting at `slot`, lands in the same
        # bit field of the output words.
        shift = num_bits * slot
        packed |= unpacked[:, slot::pack_factor] << shift

    # Reinterpret as signed 32-bit before handing the data back to torch.
    packed_i32 = packed.astype(numpy.int32)
    return torch.from_numpy(packed_i32).to(source_device).contiguous()
40
-
41
-
42
def unpack_cols(
    packed_q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    """Expand column-packed 32-bit words back into one value per column.

    ``packed_q_w`` must have shape ``(size_k, size_n // pack_factor)`` where
    ``pack_factor = 32 // num_bits``. Bit field ``i`` of every word is written
    to columns ``i, i + pack_factor, ...`` of the result. The work is done on
    the CPU through NumPy.

    Returns:
        A contiguous int32 tensor of shape ``(size_k, size_n)`` moved back to
        the device ``packed_q_w`` was on.
    """
    pack_factor = get_pack_factor(num_bits)
    assert size_n % pack_factor == 0

    expected_shape = (size_k, size_n // pack_factor)
    assert (
        packed_q_w.shape == expected_shape
    ), "packed_q_w.shape = {} size_k = {}, size_n = {} pack_Factor = {}".format(
        packed_q_w.shape, size_k, size_n, pack_factor
    )

    source_device = packed_q_w.device
    words = packed_q_w.cpu().numpy().astype(numpy.uint32)
    unpacked = numpy.zeros((size_k, size_n), dtype=numpy.uint32)

    field_mask = (1 << num_bits) - 1
    for slot in range(pack_factor):
        # Shift the wanted field down to bit 0, then drop everything above it.
        unpacked[:, slot::pack_factor] = (words >> (num_bits * slot)) & field_mask

    unpacked_i32 = unpacked.astype(numpy.int32)
    return torch.from_numpy(unpacked_i32).to(source_device).contiguous()
72
-
73
-
74
def quantize_weights(
    w: torch.Tensor,
    quant_type: ScalarType,
    group_size: Optional[int],
    zero_points: bool = False,
    ref_zero_points_after_scales: bool = False,
):
    """Quantize a 2-D float weight and compute its dequantized reference.

    Args:
        w: floating-point tensor of shape ``(size_k, size_n)``.
        quant_type: integer scalar type; its ``min()``/``max()`` bound the
            quantized values and its bias, if any, is added to ``w_q``.
        group_size: number of rows along the first dimension that share one
            scale. ``-1`` is replaced by ``size_k`` (channelwise); ``None``
            leaves the weights unscaled.
        zero_points: when true, also compute per-group zero points; this
            requires ``group_size`` to be given and an unsigned ``quant_type``.
        ref_zero_points_after_scales: when true and zero points exist, build
            the reference as ``w_q * s - zp * s`` instead of ``(w_q - zp) * s``.

    Returns:
        ``(w_ref, w_q, w_s, w_zp)``: the dequantized reference, the quantized
        weights, the scales (``None`` when ``group_size`` is ``None``) and the
        zero points (``None`` when not computed).
    """
    assert (
        quant_type.is_integer()
    ), "Floating point quantization may work but has not been tested"
    assert not zero_points or group_size is not None, (
        "to have group zero points, group_size must be provided "
        "(-1 group_size is channelwise)"
    )

    orig_device, orig_type = w.device, w.dtype
    size_k, size_n = w.shape

    assert w.is_floating_point(), "w must be float"

    if group_size == -1:
        group_size = size_k

    scaled = group_size is not None
    grouped = scaled and group_size < size_k

    if grouped:
        # Rearrange to [group_size, num_groups * size_n] so that a reduction
        # over dim 0 yields one statistic per (group, column) pair.
        w = (
            w.reshape((-1, group_size, size_n))
            .permute(1, 0, 2)
            .reshape((group_size, -1))
        )

    # Per-group extrema and the representable integer range.
    col_max = torch.max(w, 0, keepdim=True).values
    col_min = torch.min(w, 0, keepdim=True).values
    q_max = quant_type.max()
    q_min = quant_type.min()

    scales = torch.Tensor([1.0]).to(w.device)  # unscaled case
    zp = None
    if scaled:
        if zero_points:
            assert not quant_type.is_signed() and quant_type.max() > 0
            scales = (col_max - col_min).clamp(min=1e-5) / quant_type.max()
            zp = torch.round(torch.abs(col_min / scales)).clamp(q_min, q_max).int()
        else:
            # If the bias is such that there are no possible negative/positive
            # values, set the max value to inf to avoid divide by 0
            pos_divisor = q_max if q_max != 0 else torch.inf
            neg_divisor = q_min if q_min != 0 else torch.inf
            scales = torch.max(
                abs(col_max / pos_divisor),
                abs(col_min / neg_divisor),
            )

    # Quantize
    zp_shift = zp if zero_points else 0
    w_q = torch.clamp(torch.round(w / scales).int() + zp_shift, q_min, q_max)

    # Compute ref (dequantized)
    # For some kernels (namely Machete) the zero-points are applied after the
    # scales are applied, for this case computing the reference in similar way
    # allows us to use tighter error tolerances in our unit tests.
    if ref_zero_points_after_scales and zp is not None:
        w_ref = w_q.to(orig_type) * scales - zp.to(orig_type) * scales
    else:
        w_ref = (w_q - zp_shift).to(orig_type) * scales

    if quant_type.has_bias():
        w_q += quant_type.bias

    # Restore original shapes
    if grouped:

        def ungroup(t):
            t = t.reshape((group_size, -1, size_n)).permute(1, 0, 2)
            return t.reshape((size_k, size_n)).contiguous()

        w_q = ungroup(w_q)
        w_ref = ungroup(w_ref)
        scales = scales.reshape((-1, size_n)).contiguous()

    if zp is not None:
        zp = zp.reshape((-1, size_n)).contiguous().to(device=orig_device)

    return (
        w_ref.to(device=orig_device),
        w_q.to(device=orig_device),
        scales if scaled else None,
        zp,
    )
@@ -1,94 +0,0 @@
1
- import re
2
- from typing import List, Union
3
-
4
- import torch
5
-
6
- from sglang.srt.managers.multimodal_processors.base_processor import (
7
- BaseMultimodalProcessor,
8
- MultimodalSpecialTokens,
9
- )
10
- from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
11
- from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
12
-
13
-
14
class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
    """Audio request processor; ``models`` lists the model class it serves."""

    models = [Qwen2AudioForConditionalGeneration]

    def __init__(self, hf_config, server_args, _processor):
        super().__init__(hf_config, server_args, _processor)
        # Literal audio placeholder, plus a pattern that also matches the
        # placeholder with its inner <|AUDIO|> token repeated.
        self.AUDIO_TOKEN = "<|audio_bos|><|AUDIO|><|audio_eos|>"
        self.AUDIO_TOKEN_REGEX = re.compile(
            r"<\|audio_bos\|>(?:<\|AUDIO\|>)+<\|audio_eos\|>"
        )

    async def process_mm_data_async(
        self,
        image_data: List[Union[str, bytes]],
        input_text,
        request_obj,
        max_req_input_len,
        **kwargs,
    ):
        """Load and tokenize the audio of a request.

        Audio is read from ``request_obj.audio_data`` (a non-list value is
        wrapped into a one-element list); ``image_data`` is not used here.

        Returns:
            ``None`` when ``load_mm_data`` yields nothing, otherwise a dict
            with the multimodal items, the flattened input ids as a list and
            the ids of the three audio special tokens.
        """
        raw_audio = request_obj.audio_data
        audio_list = raw_audio if isinstance(raw_audio, list) else [raw_audio]

        special_tokens = MultimodalSpecialTokens(
            audio_token=self.AUDIO_TOKEN,
            audio_token_regex=self.AUDIO_TOKEN_REGEX,
        )
        base_output = self.load_mm_data(
            prompt=input_text,
            max_req_input_len=max_req_input_len,
            audio_data=audio_list,
            multimodal_tokens=special_tokens,
        )
        if base_output is None:
            return None

        res = self.process_mm_data(
            input_text=base_output.input_text,
            audio=base_output.audios,
        )

        # Collect special token ids
        tokenizer = self._processor.tokenizer
        audio_start_id = tokenizer.convert_tokens_to_ids("<|audio_bos|>")
        audio_token_id = tokenizer.convert_tokens_to_ids("<|AUDIO|>")
        audio_end_id = tokenizer.convert_tokens_to_ids("<|audio_eos|>")

        input_ids = res["input_ids"].flatten()
        items = []

        has_audio_features = (
            "input_features" in res
            and res["input_features"] is not None
            and len(res["input_features"]) != 0
        )
        if has_audio_features:
            if audio_start_id is None or audio_end_id is None:
                audio_offsets = None
            else:
                audio_offsets = self.get_mm_items_offset_by_pair(
                    input_ids=input_ids,
                    mm_start_id=audio_start_id,
                    mm_end_id=audio_end_id,
                )

            # Derive per-clip output lengths from the attention mask by
            # applying two successive length reductions.
            mask_lengths = res["feature_attention_mask"].sum(dim=-1)
            reduced_lengths = (mask_lengths - 1) // 2 + 1
            output_lengths = (reduced_lengths - 2) // 2 + 1

            items.append(
                MultimodalDataItem(
                    audio_features=res["input_features"],
                    audio_feature_lens=output_lengths,
                    audio_offsets=audio_offsets,
                    modality=Modality.AUDIO,
                )
            )

        return {
            "mm_items": items,
            "input_ids": input_ids.tolist(),
            "audio_start_id": audio_start_id,
            "audio_token_id": audio_token_id,
            "audio_end_id": audio_end_id,
        }