sglang 0.4.0.post2__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +0 -12
- sglang/bench_one_batch.py +0 -12
- sglang/bench_serving.py +1 -0
- sglang/srt/aio_rwlock.py +100 -0
- sglang/srt/configs/model_config.py +8 -1
- sglang/srt/layers/attention/flashinfer_backend.py +49 -5
- sglang/srt/layers/linear.py +20 -2
- sglang/srt/layers/{ep_moe → moe/ep_moe}/layer.py +14 -39
- sglang/srt/layers/moe/fused_moe_native.py +46 -0
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/__init__.py +3 -7
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/fused_moe.py +110 -98
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/layer.py +16 -48
- sglang/srt/layers/moe/topk.py +191 -0
- sglang/srt/layers/quantization/__init__.py +3 -3
- sglang/srt/layers/quantization/fp8.py +169 -32
- sglang/srt/layers/quantization/fp8_kernel.py +278 -0
- sglang/srt/layers/quantization/fp8_utils.py +90 -1
- sglang/srt/layers/torchao_utils.py +11 -15
- sglang/srt/managers/schedule_batch.py +16 -10
- sglang/srt/managers/scheduler.py +2 -2
- sglang/srt/managers/tokenizer_manager.py +86 -76
- sglang/srt/mem_cache/memory_pool.py +15 -8
- sglang/srt/model_executor/cuda_graph_runner.py +1 -1
- sglang/srt/model_executor/model_runner.py +6 -0
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_v2.py +67 -18
- sglang/srt/models/grok.py +1 -1
- sglang/srt/models/mixtral.py +2 -2
- sglang/srt/models/olmoe.py +1 -1
- sglang/srt/models/qwen2_moe.py +1 -1
- sglang/srt/models/xverse_moe.py +1 -1
- sglang/srt/openai_api/adapter.py +4 -0
- sglang/srt/server.py +1 -0
- sglang/srt/utils.py +33 -44
- sglang/test/test_block_fp8.py +341 -0
- sglang/version.py +1 -1
- {sglang-0.4.0.post2.dist-info → sglang-0.4.1.dist-info}/METADATA +3 -3
- {sglang-0.4.0.post2.dist-info → sglang-0.4.1.dist-info}/RECORD +44 -40
- sglang/srt/layers/fused_moe_patch.py +0 -133
- /sglang/srt/layers/{ep_moe → moe/ep_moe}/__init__.py +0 -0
- /sglang/srt/layers/{ep_moe → moe/ep_moe}/kernels.py +0 -0
- {sglang-0.4.0.post2.dist-info → sglang-0.4.1.dist-info}/LICENSE +0 -0
- {sglang-0.4.0.post2.dist-info → sglang-0.4.1.dist-info}/WHEEL +0 -0
- {sglang-0.4.0.post2.dist-info → sglang-0.4.1.dist-info}/top_level.txt +0 -0
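
Note: the MoE kernels now live under sglang/srt/layers/moe/ (see the renamed ep_moe and fused_moe_triton entries above), so code that imported the old paths must follow the move. A minimal sketch of the updated import, taken from the new test file further below; the old path in the comment is inferred from the rename entries and is shown only for comparison:

# sglang 0.4.0.post2 (old layout):
#     from sglang.srt.layers.fused_moe_triton.fused_moe import fused_moe
# sglang 0.4.1 (new layout under layers/moe/):
from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe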
sglang/srt/server.py
CHANGED
@@ -311,6 +311,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
             ret = await tokenizer_manager.generate_request(obj, request).__anext__()
             return ret
         except ValueError as e:
+            logger.error(f"Error: {e}")
             return ORJSONResponse(
                 {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
             )
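
The hunk above only adds server-side logging; the 400 response body is unchanged. A hedged client-side sketch of that error shape (the /generate endpoint and port 30000 are sglang's usual defaults; the invalid sampling parameter is purely illustrative and may not be the exact value that triggers a ValueError):

import requests

# Illustrative request that the server is assumed to reject with a ValueError.
resp = requests.post(
    "http://localhost:30000/generate",
    json={"text": "Hello", "sampling_params": {"max_new_tokens": -1}},
)
if resp.status_code == 400:
    # The same message is now also written to the server log via logger.error.
    print(resp.json()["error"]["message"])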
sglang/srt/utils.py
CHANGED
@@ -14,6 +14,7 @@
 """Common utilities."""
 
 import base64
+import dataclasses
 import ipaddress
 import itertools
 import json
@@ -1238,49 +1239,37 @@ def cuda_device_count_stateless() -> int:
     return _cuda_device_count_stateless(os.environ.get("CUDA_VISIBLE_DEVICES", None))
 
 
-def
-
-
-
-
-    """
-    Determine whether to use tensor cores for attention computation.
-
-    Args:
-        kv_cache_dtype: Data type of the KV cache
-        num_attention_heads: Number of attention heads
-        num_kv_heads: Number of key/value heads
-
-    Returns:
-        bool: Whether to use tensor cores
-    """
-    # Try to use environment variable first
-    env_override = os.environ.get("SGLANG_FLASHINFER_USE_TENSOR_CORE")
-    if env_override is not None:
-        return env_override.lower() == "true"
-
-    # Try to use _grouped_size_compiled_for_decode_kernels if available
-    # This is for flashinfer <=0.1.6. Otherwise, there is an accuracy bug
-    try:
-        from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
-
-        if not _grouped_size_compiled_for_decode_kernels(
-            num_attention_heads,
-            num_kv_heads,
-        ):
-            return True
+def dataclass_to_string_truncated(data, max_length=2048):
+    if isinstance(data, str):
+        if len(data) > max_length:
+            half_length = max_length // 2
+            return f'"{data[:half_length]} ... {data[-half_length:]}"'
         else:
-            return
-
-
-
-
-
-
-
-
-
-
+            return f'"{data}"'
+    elif isinstance(data, (list, tuple)):
+        if len(data) > max_length:
+            half_length = max_length // 2
+            return str(data[:half_length]) + " ... " + str(data[-half_length:])
+        else:
+            return str(data)
+    elif isinstance(data, dict):
+        return (
+            "{"
+            + ", ".join(
+                f"{k}: {dataclass_to_string_truncated(v, max_length)}"
+                for k, v in data.items()
+            )
+            + "}"
+        )
+    elif dataclasses.is_dataclass(data):
+        fields = dataclasses.fields(data)
+        return (
+            f"{data.__class__.__name__}("
+            + ", ".join(
+                f"{f.name}={dataclass_to_string_truncated(getattr(data, f.name), max_length)}"
+                for f in fields
+            )
+            + ")"
+        )
     else:
-        return
+        return str(data)
sglang/test/test_block_fp8.py
ADDED
@@ -0,0 +1,341 @@
+import itertools
+import unittest
+
+import torch
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.quantization.fp8_kernel import (
+    per_token_group_quant_fp8,
+    w8a8_block_fp8_matmul,
+)
+
+
+# For test
+def native_per_token_group_quant_fp8(
+    x, group_size, eps=1e-10, dtype=torch.float8_e4m3fn
+):
+    """Function to perform per-token-group quantization on an input tensor `x` using native torch.
+
+    It converts the tensor values into float8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+    Note that only `torch.float8_e4m3fn` is supported for now.
+    """
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    finfo = torch.finfo(dtype)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    x_ = x.reshape(x.numel() // group_size, group_size)
+    amax = x_.abs().max(dim=-1, keepdim=True)[0].clamp(min=eps).to(torch.float32)
+    x_s = amax / fp8_max
+    x_q = (x_ / x_s).clamp(min=fp8_min, max=fp8_max).to(dtype)
+    x_q = x_q.reshape(x.shape)
+    x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size,))
+
+    return x_q, x_s
+
+
+class TestPerTokenGroupQuantFP8(unittest.TestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    GROUP_SIZE = [64, 128, 256, 512]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _per_token_group_quant_fp8(self, num_tokens, d, dtype, group_size, seed):
+        torch.manual_seed(seed)
+
+        x = torch.rand(num_tokens, d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size)
+            out, scale = per_token_group_quant_fp8(x, group_size)
+
+        self.assertTrue(
+            torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.15)
+        )
+        self.assertTrue(torch.allclose(scale, ref_scale))
+
+    def test_per_token_group_quant_fp8(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.GROUP_SIZE,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                dtype=params[2],
+                group_size=params[3],
+                seed=params[4],
+            ):
+                self._per_token_group_quant_fp8(*params)
+
+
+# For test
+def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
+    """This function performs matrix multiplication with block-wise quantization using native torch.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+    """
+
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+    assert A.shape[-1] == B.shape[-1]
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+    assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+
+    M = A.numel() // A.shape[-1]
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (N,)
+    A = A.reshape(M, A.shape[-1])
+    As = As.reshape(M, As.shape[-1])
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+    assert n_tiles == Bs.shape[0]
+    assert k_tiles == Bs.shape[1]
+
+    C_shape = (M, N)
+    C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+    A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)]
+    B_tiles = [
+        [
+            B[
+                j * block_n : min((j + 1) * block_n, N),
+                i * block_k : min((i + 1) * block_k, K),
+            ]
+            for i in range(k_tiles)
+        ]
+        for j in range(n_tiles)
+    ]
+    C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)]
+    As_tiles = [As[:, i : i + 1] for i in range(k_tiles)]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            a = A_tiles[i]
+            b = B_tiles[j][i]
+            c = C_tiles[j]
+            s = As_tiles[i] * Bs[j][i]
+            c[:, :] += torch.matmul(a, b.t()) * s
+
+    C = C.reshape(origin_C_shape).to(output_dtype)
+    return C
+
+
+class TestW8A8BlockFP8Matmul(unittest.TestCase):
+    OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
+    M = [1, 7, 83, 512, 2048]
+    N = [128, 512, 1024, 4096, 7748, 13824]
+    K = [256, 4096, 5120, 3884, 13824]
+    # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+    BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_fp8_matmul(self, M, N, K, block_size, out_dtype, seed):
+        torch.manual_seed(seed)
+        # NOTE(HandH1998): to avoid overflow when out_dtype = torch.half
+        factor_for_scale = 1e-2
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+        A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+        B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles = (N + block_n - 1) // block_n
+        k_tiles = (K + block_k - 1) // block_k
+
+        As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale
+        Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
+
+        with torch.inference_mode():
+            ref_out = native_w8a8_block_fp8_matmul(
+                A_fp8, B_fp8, As, Bs, block_size, out_dtype
+            )
+            out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
+
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.001
+        )
+
+    def test_w8a8_block_fp8_matmul(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.BLOCK_SIZE,
+            self.OUT_DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                block_size=params[3],
+                out_dtype=params[4],
+                seed=params[5],
+            ):
+                self._w8a8_block_fp8_matmul(*params)
+
+
+# For test
+def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
+    """This function performs fused moe with block-wise quantization using native torch."""
+
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+
+    _, block_k = block_shape[0], block_shape[1]
+    a_q, a_s = native_per_token_group_quant_fp8(a, block_k)
+    # NOTE(HandH1998): Since "index_cuda" not implemented for 'Float8_e4m3fn', we need to cast `float8`` to `float32``.
+    a_q = a_q.to(torch.float32)
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            inter_out = native_w8a8_block_fp8_matmul(
+                a_q[mask], w1[i], a_s[mask], w1_s[i], block_shape, output_dtype=a.dtype
+            )
+            act_out = SiluAndMul().forward_native(inter_out)
+            act_out_q, act_out_s = native_per_token_group_quant_fp8(act_out, block_k)
+            act_out = act_out.to(torch.float32)
+            out[mask] = native_w8a8_block_fp8_matmul(
+                act_out_q, w2[i], act_out_s, w2_s[i], block_shape, output_dtype=a.dtype
+            )
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
+
+
+class TestW8A8BlockFP8FusedMoE(unittest.TestCase):
+    DTYPES = [torch.float32, torch.half, torch.bfloat16]
+    M = [1, 33, 64, 222, 1024 * 128]
+    N = [128, 1024, 2048]
+    K = [256, 4096, 5120]
+    E = [8, 24]
+    TOP_KS = [2, 6]
+    BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+    # BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_fp8_fused_moe(self, M, N, K, E, topk, block_size, dtype, seed):
+        torch.manual_seed(seed)
+        # NOTE(HandH1998): to avoid overflow when out_dtype = torch.half
+        factor_for_scale = 1e-2
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        a = torch.randn((M, K), dtype=dtype) / 10
+
+        w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 * fp8_max
+        w1 = w1_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 * fp8_max
+        w2 = w2_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles_w1 = (2 * N + block_n - 1) // block_n
+        n_tiles_w2 = (K + block_n - 1) // block_n
+        k_tiles_w1 = (K + block_k - 1) // block_k
+        k_tiles_w2 = (N + block_k - 1) // block_k
+
+        w1_s = (
+            torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32)
+            * factor_for_scale
+        )
+        w2_s = (
+            torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32)
+            * factor_for_scale
+        )
+
+        score = torch.randn((M, E), dtype=dtype)
+
+        with torch.inference_mode():
+            out = fused_moe(
+                a,
+                w1,
+                w2,
+                score,
+                topk,
+                renormalize=False,
+                use_fp8_w8a8=True,
+                w1_scale=w1_s,
+                w2_scale=w2_s,
+                block_shape=block_size,
+            )
+            ref_out = torch_w8a8_block_fp8_moe(
+                a, w1, w2, w1_s, w2_s, score, topk, block_size
+            )
+
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.02
+        )
+
+    def test_w8a8_block_fp8_fused_moe(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.E,
+            self.TOP_KS,
+            self.BLOCK_SIZE,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                E=params[3],
+                topk=params[4],
+                block_size=params[5],
+                dtype=params[6],
+                seed=params[7],
+            ):
+                self._w8a8_block_fp8_fused_moe(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
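
Outside the test harness, the same kernels can be called directly. A minimal sketch of the per-token-group quantization plus block-wise FP8 matmul path, mirroring the calls exercised by the tests above (the shapes, the random weight, and its scales are illustrative only; a CUDA device is assumed):

import torch

from sglang.srt.layers.quantization.fp8_kernel import (
    per_token_group_quant_fp8,
    w8a8_block_fp8_matmul,
)

M, N, K = 16, 512, 4096
block_n, block_k = 128, 128

x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
# Quantize activations per 128-wide group: fp8 values plus one fp32 scale per group.
x_q, x_s = per_token_group_quant_fp8(x, block_k)  # x_q: (M, K), x_s: (M, K // block_k)

# A block-quantized weight with one fp32 scale per (block_n, block_k) tile
# (random here; real scales come from offline weight quantization).
w_q = torch.randn(N, K, device="cuda").clamp(-400, 400).to(torch.float8_e4m3fn)
w_s = torch.rand(N // block_n, K // block_k, dtype=torch.float32, device="cuda") * 1e-2

out = w8a8_block_fp8_matmul(x_q, w_q, x_s, w_s, [block_n, block_k], torch.bfloat16)
print(out.shape)  # torch.Size([16, 512])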
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.0.post2"
+__version__ = "0.4.1"
{sglang-0.4.0.post2.dist-info → sglang-0.4.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.4.0.post2
+Version: 0.4.1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -234,7 +234,6 @@ Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
-Requires-Dist: gemlite; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
@@ -244,6 +243,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
+Requires-Dist: sgl-kernel>=0.0.2.post8; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -358,7 +358,7 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI and DataCrunch.
 
 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).