sglang 0.4.0.post2__py3-none-any.whl → 0.4.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +0 -12
- sglang/bench_one_batch.py +0 -12
- sglang/bench_serving.py +11 -2
- sglang/lang/backend/openai.py +10 -0
- sglang/srt/aio_rwlock.py +100 -0
- sglang/srt/configs/model_config.py +8 -1
- sglang/srt/constrained/xgrammar_backend.py +6 -0
- sglang/srt/layers/attention/flashinfer_backend.py +49 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -14
- sglang/srt/layers/linear.py +20 -2
- sglang/srt/layers/{ep_moe → moe/ep_moe}/layer.py +14 -39
- sglang/srt/layers/moe/fused_moe_native.py +46 -0
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/__init__.py +3 -7
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/fused_moe.py +124 -99
- sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/layer.py +16 -48
- sglang/srt/layers/moe/topk.py +205 -0
- sglang/srt/layers/quantization/__init__.py +3 -3
- sglang/srt/layers/quantization/fp8.py +169 -32
- sglang/srt/layers/quantization/fp8_kernel.py +292 -0
- sglang/srt/layers/quantization/fp8_utils.py +90 -1
- sglang/srt/layers/torchao_utils.py +11 -15
- sglang/srt/managers/schedule_batch.py +16 -10
- sglang/srt/managers/schedule_policy.py +1 -1
- sglang/srt/managers/scheduler.py +13 -16
- sglang/srt/managers/tokenizer_manager.py +130 -111
- sglang/srt/mem_cache/memory_pool.py +15 -8
- sglang/srt/model_executor/cuda_graph_runner.py +1 -1
- sglang/srt/model_loader/loader.py +22 -11
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_v2.py +67 -18
- sglang/srt/models/gemma2.py +19 -0
- sglang/srt/models/grok.py +1 -1
- sglang/srt/models/llama.py +2 -2
- sglang/srt/models/mixtral.py +2 -2
- sglang/srt/models/olmoe.py +1 -1
- sglang/srt/models/qwen2_moe.py +1 -1
- sglang/srt/models/xverse_moe.py +1 -1
- sglang/srt/openai_api/adapter.py +23 -0
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/sampling_params.py +9 -2
- sglang/srt/server.py +21 -37
- sglang/srt/utils.py +33 -44
- sglang/test/test_block_fp8.py +341 -0
- sglang/version.py +1 -1
- {sglang-0.4.0.post2.dist-info → sglang-0.4.1.post1.dist-info}/METADATA +4 -4
- {sglang-0.4.0.post2.dist-info → sglang-0.4.1.post1.dist-info}/RECORD +52 -48
- sglang/srt/layers/fused_moe_patch.py +0 -133
- /sglang/srt/layers/{ep_moe → moe/ep_moe}/__init__.py +0 -0
- /sglang/srt/layers/{ep_moe → moe/ep_moe}/kernels.py +0 -0
- {sglang-0.4.0.post2.dist-info → sglang-0.4.1.post1.dist-info}/LICENSE +0 -0
- {sglang-0.4.0.post2.dist-info → sglang-0.4.1.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.0.post2.dist-info → sglang-0.4.1.post1.dist-info}/top_level.txt +0 -0
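The most consequential layout change in this release is that the MoE code moved under sglang/srt/layers/moe/ (the old fused_moe_patch.py shim is removed, and fused_moe_native.py and topk.py are new), so downstream imports need the new paths. A minimal sketch of the import migration, using only module paths that appear in this diff; the module alias below is illustrative:

    # sglang 0.4.0.post2
    # from sglang.srt.layers.fused_moe_triton.fused_moe import fused_moe
    # from sglang.srt.layers.ep_moe import layer as ep_moe_layer

    # sglang 0.4.1.post1
    from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
    from sglang.srt.layers.moe.ep_moe import layer as ep_moe_layer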
sglang/test/test_block_fp8.py ADDED
@@ -0,0 +1,341 @@
+import itertools
+import unittest
+
+import torch
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.quantization.fp8_kernel import (
+    per_token_group_quant_fp8,
+    w8a8_block_fp8_matmul,
+)
+
+
+# For test
+def native_per_token_group_quant_fp8(
+    x, group_size, eps=1e-10, dtype=torch.float8_e4m3fn
+):
+    """Function to perform per-token-group quantization on an input tensor `x` using native torch.
+
+    It converts the tensor values into float8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+    Note that only `torch.float8_e4m3fn` is supported for now.
+    """
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    finfo = torch.finfo(dtype)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    x_ = x.reshape(x.numel() // group_size, group_size)
+    amax = x_.abs().max(dim=-1, keepdim=True)[0].clamp(min=eps).to(torch.float32)
+    x_s = amax / fp8_max
+    x_q = (x_ / x_s).clamp(min=fp8_min, max=fp8_max).to(dtype)
+    x_q = x_q.reshape(x.shape)
+    x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size,))
+
+    return x_q, x_s
+
+
+class TestPerTokenGroupQuantFP8(unittest.TestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    GROUP_SIZE = [64, 128, 256, 512]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _per_token_group_quant_fp8(self, num_tokens, d, dtype, group_size, seed):
+        torch.manual_seed(seed)
+
+        x = torch.rand(num_tokens, d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size)
+            out, scale = per_token_group_quant_fp8(x, group_size)
+
+        self.assertTrue(
+            torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.15)
+        )
+        self.assertTrue(torch.allclose(scale, ref_scale))
+
+    def test_per_token_group_quant_fp8(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.GROUP_SIZE,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                dtype=params[2],
+                group_size=params[3],
+                seed=params[4],
+            ):
+                self._per_token_group_quant_fp8(*params)
+
+
+# For test
+def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
+    """This function performs matrix multiplication with block-wise quantization using native torch.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+    """
+
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+    assert A.shape[-1] == B.shape[-1]
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+    assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+
+    M = A.numel() // A.shape[-1]
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (N,)
+    A = A.reshape(M, A.shape[-1])
+    As = As.reshape(M, As.shape[-1])
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+    assert n_tiles == Bs.shape[0]
+    assert k_tiles == Bs.shape[1]
+
+    C_shape = (M, N)
+    C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+    A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)]
+    B_tiles = [
+        [
+            B[
+                j * block_n : min((j + 1) * block_n, N),
+                i * block_k : min((i + 1) * block_k, K),
+            ]
+            for i in range(k_tiles)
+        ]
+        for j in range(n_tiles)
+    ]
+    C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)]
+    As_tiles = [As[:, i : i + 1] for i in range(k_tiles)]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            a = A_tiles[i]
+            b = B_tiles[j][i]
+            c = C_tiles[j]
+            s = As_tiles[i] * Bs[j][i]
+            c[:, :] += torch.matmul(a, b.t()) * s
+
+    C = C.reshape(origin_C_shape).to(output_dtype)
+    return C
+
+
+class TestW8A8BlockFP8Matmul(unittest.TestCase):
+    OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
+    M = [1, 7, 83, 512, 2048]
+    N = [128, 512, 1024, 4096, 7748, 13824]
+    K = [256, 4096, 5120, 3884, 13824]
+    # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+    BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_fp8_matmul(self, M, N, K, block_size, out_dtype, seed):
+        torch.manual_seed(seed)
+        # NOTE(HandH1998): to avoid overflow when out_dtype = torch.half
+        factor_for_scale = 1e-2
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+        A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+        B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles = (N + block_n - 1) // block_n
+        k_tiles = (K + block_k - 1) // block_k
+
+        As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale
+        Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
+
+        with torch.inference_mode():
+            ref_out = native_w8a8_block_fp8_matmul(
+                A_fp8, B_fp8, As, Bs, block_size, out_dtype
+            )
+            out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
+
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.001
+        )
+
+    def test_w8a8_block_fp8_matmul(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.BLOCK_SIZE,
+            self.OUT_DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                block_size=params[3],
+                out_dtype=params[4],
+                seed=params[5],
+            ):
+                self._w8a8_block_fp8_matmul(*params)
+
+
+# For test
+def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
+    """This function performs fused moe with block-wise quantization using native torch."""
+
+    B, D = a.shape
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+
+    _, block_k = block_shape[0], block_shape[1]
+    a_q, a_s = native_per_token_group_quant_fp8(a, block_k)
+    # NOTE(HandH1998): Since "index_cuda" not implemented for 'Float8_e4m3fn', we need to cast `float8` to `float32`.
+    a_q = a_q.to(torch.float32)
+    for i in range(w1.shape[0]):
+        mask = topk_ids == i
+        if mask.sum():
+            inter_out = native_w8a8_block_fp8_matmul(
+                a_q[mask], w1[i], a_s[mask], w1_s[i], block_shape, output_dtype=a.dtype
+            )
+            act_out = SiluAndMul().forward_native(inter_out)
+            act_out_q, act_out_s = native_per_token_group_quant_fp8(act_out, block_k)
+            act_out = act_out.to(torch.float32)
+            out[mask] = native_w8a8_block_fp8_matmul(
+                act_out_q, w2[i], act_out_s, w2_s[i], block_shape, output_dtype=a.dtype
+            )
+    return (
+        out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+    ).sum(dim=1)
+
+
+class TestW8A8BlockFP8FusedMoE(unittest.TestCase):
+    DTYPES = [torch.float32, torch.half, torch.bfloat16]
+    M = [1, 33, 64, 222, 1024 * 128]
+    N = [128, 1024, 2048]
+    K = [256, 4096, 5120]
+    E = [8, 24]
+    TOP_KS = [2, 6]
+    BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+    # BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_fp8_fused_moe(self, M, N, K, E, topk, block_size, dtype, seed):
+        torch.manual_seed(seed)
+        # NOTE(HandH1998): to avoid overflow when out_dtype = torch.half
+        factor_for_scale = 1e-2
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        a = torch.randn((M, K), dtype=dtype) / 10
+
+        w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 * fp8_max
+        w1 = w1_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 * fp8_max
+        w2 = w2_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles_w1 = (2 * N + block_n - 1) // block_n
+        n_tiles_w2 = (K + block_n - 1) // block_n
+        k_tiles_w1 = (K + block_k - 1) // block_k
+        k_tiles_w2 = (N + block_k - 1) // block_k
+
+        w1_s = (
+            torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32)
+            * factor_for_scale
+        )
+        w2_s = (
+            torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32)
+            * factor_for_scale
+        )
+
+        score = torch.randn((M, E), dtype=dtype)
+
+        with torch.inference_mode():
+            out = fused_moe(
+                a,
+                w1,
+                w2,
+                score,
+                topk,
+                renormalize=False,
+                use_fp8_w8a8=True,
+                w1_scale=w1_s,
+                w2_scale=w2_s,
+                block_shape=block_size,
+            )
+            ref_out = torch_w8a8_block_fp8_moe(
+                a, w1, w2, w1_s, w2_s, score, topk, block_size
+            )
+
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.02
+        )
+
+    def test_w8a8_block_fp8_fused_moe(self):
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.E,
+            self.TOP_KS,
+            self.BLOCK_SIZE,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                E=params[3],
+                topk=params[4],
+                block_size=params[5],
+                dtype=params[6],
+                seed=params[7],
+            ):
+                self._w8a8_block_fp8_fused_moe(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
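The reference implementation above makes the scale semantics concrete: per-token-group quantization keeps one float32 scale for every group_size-wide slice of a token, so dequantization is just a per-group multiply. A minimal round-trip sketch, assuming native_per_token_group_quant_fp8 from the test file above is in scope and the installed PyTorch build supports float8_e4m3fn (the shapes are illustrative):

    import torch

    x = torch.rand(4, 256, dtype=torch.float32)          # 4 tokens, hidden size 256
    x_q, x_s = native_per_token_group_quant_fp8(x, 128)  # x_q: (4, 256) fp8, x_s: (4, 2) scales
    # Broadcast each group's scale back over its 128 columns to dequantize.
    x_dq = x_q.to(torch.float32) * x_s.repeat_interleave(128, dim=-1)
    print((x_dq - x).abs().max())                        # small, bounded by fp8 rounding error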
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.0.post2"
+__version__ = "0.4.1.post1"
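A trivial way to check which side of this diff is installed in the current environment (importing sglang.version directly, so no assumption is made about what the package's __init__.py re-exports):

    from sglang.version import __version__
    print(__version__)  # "0.4.1.post1" after upgrading, "0.4.0.post2" before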
{sglang-0.4.0.post2.dist-info → sglang-0.4.1.post1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.4.0.post2
+Version: 0.4.1.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -234,7 +234,6 @@ Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
-Requires-Dist: gemlite; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
@@ -244,6 +243,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
+Requires-Dist: sgl-kernel>=0.0.2.post10; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -358,8 +358,8 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

 ## Adoption and Sponsorship
-The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI
+The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI.

 ## Acknowledgment and Citation
 We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
-Please cite
+Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
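The dependency changes above (gemlite dropped from the runtime_common extra, sgl-kernel>=0.0.2.post10 added to the srt extra) can be checked against the installed metadata; a small sketch using only the standard library, assuming Python 3.8+ with sglang 0.4.1.post1 installed:

    from importlib.metadata import requires

    for req in requires("sglang"):
        if "sgl-kernel" in req or "gemlite" in req:
            print(req)  # expect a sgl-kernel pin guarded by extra == "srt", and no gemlite entry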
{sglang-0.4.0.post2.dist-info → sglang-0.4.1.post1.dist-info}/RECORD CHANGED
@@ -1,17 +1,17 @@
 sglang/__init__.py,sha256=b2oIdWzp5P8SzieeOs2TzJoN3Do3tfJbV8gZS_imVcs,1619
 sglang/api.py,sha256=NdO6cYnklnEBQBKqQjlqI8-P1EownKQ71t5ibCGhEVo,6953
 sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
-sglang/bench_offline_throughput.py,sha256=
-sglang/bench_one_batch.py,sha256=
+sglang/bench_offline_throughput.py,sha256=iQiJCK3KQDCdwU1NVbIwbtthssWzBXiIsKUDA7Z_hO0,12510
+sglang/bench_one_batch.py,sha256=jkyMhK0lqn5dRCYgAh30qZrNHP4gAbXODymBMNXK86I,15859
 sglang/bench_one_batch_server.py,sha256=-fV9FTLNNcSIy0pgYeggXedPVK0fVsXZqVQswT8OMOY,5945
-sglang/bench_serving.py,sha256=
+sglang/bench_serving.py,sha256=YQiCZreejCPBTqMmZsCB99RMi1N-Jx-dZtaafcQ8-14,53377
 sglang/check_env.py,sha256=4OqpZaEJOfBM6-vtPILto5kqDmgiZM1Koc7lK78A7CI,8427
 sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
 sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
 sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
 sglang/llama3_eval.py,sha256=gWSboDchIGybIce88bJlrCG0yiLZ513mw4gcutJlzGM,10017
 sglang/utils.py,sha256=23jf4Mz8E5p5a6JOkjnfYZixdjZUk88F_mZ8rZcby5Q,11597
-sglang/version.py,sha256=
+sglang/version.py,sha256=ARioq8ApVNckeQorLPVfHZeN9mlHMLbaNgLGNbGq-ys,28
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=cnfjjxIIcYRGRxXlJlOGnpFxFuhMHut7DS52LsOMKcA,15826
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -23,28 +23,29 @@ sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
 sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
 sglang/lang/backend/base_backend.py,sha256=tdoh9YF3CyekY1BKiX9n7-aA4srDWIuA4RDJLM7q8qg,1985
 sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
-sglang/lang/backend/openai.py,sha256=
+sglang/lang/backend/openai.py,sha256=ha9a2P6T80TmSgYlyIwB1qYawWkjcOgiOptkktkqa1U,15436
 sglang/lang/backend/runtime_endpoint.py,sha256=dfs-yZ1ekKmnbpZLluQHWPmMeZJKbaaZRRGYRa9eBE8,10541
 sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
 sglang/srt/_custom_ops.py,sha256=Y4gyTDGhWz-W2Igq25Ojm8XFiyvkawW9I-79iwYvxJ0,3574
+sglang/srt/aio_rwlock.py,sha256=6LYtOdeTUY3hkfa1dmYkgsaF2ttrwIF3hUWz2AZ2fqw,2970
 sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21201
 sglang/srt/hf_transformers_utils.py,sha256=38Ms0H2-VMerOS6jnczcFtZMS6lhw9B5rSWKAfxVUfQ,7945
 sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
 sglang/srt/model_parallel.py,sha256=eLXZhvJ4wG6dh0FontNCIdVZvHYdWgaeY-5cu7TD9tE,6078
-sglang/srt/server.py,sha256=
+sglang/srt/server.py,sha256=vDucJl6qtEK2swzPJ_wYitaJvsI4MigMagGlBlH5V54,34033
 sglang/srt/server_args.py,sha256=LgnQ-kBJZ3E7hMMZj9bSK0mn7Bhjk1nJHxLcxl-lGTM,34572
-sglang/srt/utils.py,sha256=
+sglang/srt/utils.py,sha256=J8kFl6kDBwFZCM6AKaVTiqdhJKRg0JOH0pNrD1ZeWmM,41726
 sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
 sglang/srt/configs/device_config.py,sha256=dResqHjkg_dq10v6rnVpbXpvABZRB0jylOm-2_JAnx0,428
 sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
 sglang/srt/configs/load_config.py,sha256=TcPi_HY6xu5SiVZsxPOoB5pGeDUNebOk7muoUH9VBDg,3083
-sglang/srt/configs/model_config.py,sha256=
+sglang/srt/configs/model_config.py,sha256=vVarlLTw9Ged1PXIwRP-R8UhiG6oaezNIZhTNuF0eQc,16070
 sglang/srt/configs/qwen2vl.py,sha256=ZjLy9v2eZY4wptUfY3CWgYKg2B5DDrkfCSyTy_Zf_bg,4351
 sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO0ATuk,786
 sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
 sglang/srt/constrained/outlines_backend.py,sha256=CipNHNNXs8xtnJNVNe6FCwZUlSbIXbGmWVlZz3hUpFQ,6820
 sglang/srt/constrained/outlines_jump_forward.py,sha256=iZWXeR3gNYoMubLGyFmLPO4V2YsN5DiGjD71Xk9iFaE,6418
-sglang/srt/constrained/xgrammar_backend.py,sha256=
+sglang/srt/constrained/xgrammar_backend.py,sha256=76oUFXeB29bfnEVWa1-rIrwQm5jhuMlzAX10HtAq1fQ,4887
 sglang/srt/distributed/__init__.py,sha256=__tl9Frrf3PFrSyNYcn5i-y2rL-J4-Qn6RJwrsZ4xgc,83
 sglang/srt/distributed/communication_op.py,sha256=ZoIhboZyefiAwr-1K-wF3rAFSQ4Wt-RxXpsX443Gbt4,1157
 sglang/srt/distributed/parallel_state.py,sha256=HplRH5S0AWdwSdhoHYX9_UWQZlFjh2Z1LHaz68EXlpE,47555
@@ -60,35 +61,37 @@ sglang/srt/distributed/device_communicators/shm_broadcast.py,sha256=WVxBd1QfIgRW
 sglang/srt/distributed/device_communicators/xpu_communicator.py,sha256=P3WKgddcfpUhBa-_5PvjYxH146ZE-N1cotTzEpPRKlY,1620
 sglang/srt/layers/activation.py,sha256=EboMjT9HV2tNHQ6rzpojtlkzev1lAFbhQlxMg9hwxBQ,5471
 sglang/srt/layers/custom_op_util.py,sha256=0vu-yX2wwonmO1L_o5G7SA6C-8XuhDIh9rPDvNeLhoc,922
-sglang/srt/layers/fused_moe_patch.py,sha256=DMIyrwOON7OSidKZdreL5HzMhP0AD5Ues0xdY-ADOQw,4471
 sglang/srt/layers/layernorm.py,sha256=nRQ1w1xSUcU-zlqVC61BnGG6otS5W1w9VaSzeXizrx4,4037
-sglang/srt/layers/linear.py,sha256=
+sglang/srt/layers/linear.py,sha256=KyRFU0VcoNuN-hnQB9QQcBN9NCpeqPtLzzufIHUpV6w,47064
 sglang/srt/layers/logits_processor.py,sha256=JlOU0x8vBGIuTwHSdjR6Kly9_uzilBMv0NE_rvUx0W4,14747
 sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
 sglang/srt/layers/radix_attention.py,sha256=E4cmvkcCdCtb6VyLNrCKy1D6VwHQ063oH3JQXPaRy6w,2178
 sglang/srt/layers/rotary_embedding.py,sha256=29tx3JNR40AoXqBa2cFGBjva9vU2xgFipETlpMaaZas,3985
 sglang/srt/layers/sampler.py,sha256=k4Op_HMkQfT7t9wgQwBVotfTUXEocrzRyQqEFnff1pc,5511
-sglang/srt/layers/torchao_utils.py,sha256=
+sglang/srt/layers/torchao_utils.py,sha256=dQVuWNXxAvOPjr2G5BBMWqC2oKcS2B52rx-fEc_elmc,3545
 sglang/srt/layers/vocab_parallel_embedding.py,sha256=slGwLiWjuFLCUdRe-GTlfumyZpqVX9VF6No_UGOT-hA,21624
 sglang/srt/layers/attention/__init__.py,sha256=KIJhzOJWYioQE7Va4D83-V-ZUZVMZcczuNgDC3dlSRo,2583
 sglang/srt/layers/attention/double_sparsity_backend.py,sha256=RQdEKRykSLf9ilnaHmR6T7RFqh4emH_adfB3aJN2BUU,10920
-sglang/srt/layers/attention/flashinfer_backend.py,sha256=
+sglang/srt/layers/attention/flashinfer_backend.py,sha256=umD1E2zvMnPbbgvx2Ex5LQB6a4a41brjsks1M0gFMMU,26357
 sglang/srt/layers/attention/torch_native_backend.py,sha256=nQdeqWEMMH_wrod5wssDCJG-uPKm0uslvkALKqPRPQ8,10509
 sglang/srt/layers/attention/triton_backend.py,sha256=-TobyZHwlbJ5HhbFg-jgCqVOw4Y-opgEuFo-EusASQc,6264
 sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=oJ_UK1t229zF3hbTDiQe7t-X-IbM2dOxx4U2ch-vmjA,17847
 sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
-sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=
+sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=DWOZXSTVN5ZbcFjDjcqs-nPdUkxSwum0SVXhVKqwh2g,11688
 sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=lojFXRZMLWkzS2Y8uxaolnQhXaWKG19mCAWaF5KQeiI,6087
-sglang/srt/layers/
-sglang/srt/layers/
-sglang/srt/layers/ep_moe/
-sglang/srt/layers/
-sglang/srt/layers/
-sglang/srt/layers/fused_moe_triton/
-sglang/srt/layers/
+sglang/srt/layers/moe/fused_moe_native.py,sha256=8q-LFZMSCGLc2_Gltp2lH0gSb4A1WOuKQW3wo3rpj5g,1601
+sglang/srt/layers/moe/topk.py,sha256=JpeIl_-CNk0yyG3k5fmmNbbmR2_9bkKC23UoLOlMkjw,6954
+sglang/srt/layers/moe/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/srt/layers/moe/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
+sglang/srt/layers/moe/ep_moe/layer.py,sha256=6iQU5ZjQ8IXGoQ8ZlBuJqyQxYTEem9vXI6rbVIWKlZw,22303
+sglang/srt/layers/moe/fused_moe_triton/__init__.py,sha256=h9yMFAL_bagUf-qBED8gSWdCOb7d8IdA-pE-L_nIg8E,842
+sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=zXwWUtthLa9E35EvlQ9A_mnIsQyA0_NYKsUBdJqONHo,31163
+sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=BclDj5JyCy-8Bfue4broL1-IG6a4dUyggE9WQLa06sg,20575
+sglang/srt/layers/quantization/__init__.py,sha256=VPYXShHvbvkOgVBlkIqic4RhdJ1y6EZ3r34T-nZMT1k,4606
 sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
-sglang/srt/layers/quantization/fp8.py,sha256=
-sglang/srt/layers/quantization/
+sglang/srt/layers/quantization/fp8.py,sha256=wNnpXLroIl7D98mlfCiXZPE9hrP5ricHrXY1WZBzEEo,30810
+sglang/srt/layers/quantization/fp8_kernel.py,sha256=eoO1enzD9jPC80id2oC3i8bt-LN6-4Ey223yOQ9yIPE,8792
+sglang/srt/layers/quantization/fp8_utils.py,sha256=HBJBaNcln1NrLxzw0ppUjMd6w-ryuGDDHCYJq7mRQac,4035
 sglang/srt/lora/lora.py,sha256=-o2mBmUvoVpdkgdAkWTARN4kfyep3UNEJLcg6moh0SU,15056
 sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
 sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rdss8,12886
@@ -96,44 +99,44 @@ sglang/srt/managers/data_parallel_controller.py,sha256=psI4FAuBGjtdnEuwagnGdtRqv
 sglang/srt/managers/detokenizer_manager.py,sha256=nZkbwt4yty_oy8rvg4T7PbgyVLoBLohvHl25xlQpBoo,8439
 sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
 sglang/srt/managers/io_struct.py,sha256=_LWWqT3LNwZGaWhg2d3kTg1V2MTHKzRasCvxF9Nfpi4,15429
-sglang/srt/managers/schedule_batch.py,sha256=
-sglang/srt/managers/schedule_policy.py,sha256=
-sglang/srt/managers/scheduler.py,sha256=
+sglang/srt/managers/schedule_batch.py,sha256=qryPWCdOTFzxomDa80U-5guShOb1K4kBUWcPCCchYB8,45762
+sglang/srt/managers/schedule_policy.py,sha256=QxjQ8-le062AMHHxool6CxkhvB4FIwhOQPzTX_JwL6U,15447
+sglang/srt/managers/scheduler.py,sha256=Yh15uQFhJlku8a20-lhtIsiEHAcUmpL3BzL42kLVwiI,61637
 sglang/srt/managers/session_controller.py,sha256=Yp-IV3rXczACZxZXmF-QxW9CWICGy8KHQ9ttBGJ8WXA,2800
-sglang/srt/managers/tokenizer_manager.py,sha256=
+sglang/srt/managers/tokenizer_manager.py,sha256=uKiTt__lCFXG60zQhmM_K7dU7IuedVSIQHVw3x3y5-E,31758
 sglang/srt/managers/tp_worker.py,sha256=X1EwFX3FSsmXx7jeeX2tjZRocaujabQYWm-M-0CFEBE,7363
 sglang/srt/managers/tp_worker_overlap_thread.py,sha256=-QNBJRKxraa9Xt2WI1AFzZYdneIJ1eXv0GjFzDqXoE0,8926
 sglang/srt/mem_cache/base_prefix_cache.py,sha256=QC8HS8RC5DXu14kyXsxAgEUsn0f932p2DjqzbKjc6Bs,962
 sglang/srt/mem_cache/chunk_cache.py,sha256=R2gHAuqKd5ayQW3NnsgoGUH31---Z5izCDyCqLL0FjQ,2524
 sglang/srt/mem_cache/flush_cache.py,sha256=GYcxmNXh4hsMpFfNOuCTpKilW7guZwTtAg_usVeM3J0,979
-sglang/srt/mem_cache/memory_pool.py,sha256=
+sglang/srt/mem_cache/memory_pool.py,sha256=oxk3UtiiFA3_1iIP6eFsk8HIcRI_8Z1-FE2KOWDr-YM,11366
 sglang/srt/mem_cache/radix_cache.py,sha256=c5voySV5L855c0G9cBEc9iQ4nR7PDDmg0V6fWWJHcq4,10945
 sglang/srt/metrics/collector.py,sha256=ZWoFx_FKN0sNMSZ8RJWUVQ0RFEYhIHxdw0d4TZTluMU,6861
 sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
-sglang/srt/model_executor/cuda_graph_runner.py,sha256=
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=1n5WxoE9-0B3unwkkcR355K_D290h2LGt_7EvH02DQM,16246
 sglang/srt/model_executor/forward_batch_info.py,sha256=L5mVoW5SaO6To-7nGk0TZM-FFB5_78cARpJ-aC2rwD0,12883
 sglang/srt/model_executor/model_runner.py,sha256=MLYBcYIQihu2I3PBTUghiU2mSWsDMzlKzcnX7yHa9JU,29837
 sglang/srt/model_loader/__init__.py,sha256=zGZkOBz1zx-pkaIy47BasL3fjDlAcxAXUTjInOhXHAE,919
-sglang/srt/model_loader/loader.py,sha256=
+sglang/srt/model_loader/loader.py,sha256=7OG_8-66vFDFZ9kVKGNK1BFBjZ6ql449dlyvdCbMqvE,43876
 sglang/srt/model_loader/utils.py,sha256=0NaMR67fESFopaklmsleiL27XH1QUrjZW246MUu1EJ0,1369
 sglang/srt/model_loader/weight_utils.py,sha256=kQo9KPThjH3HAOCfC_tdwdrshdWuWJOVpPR0skSyaRY,24193
 sglang/srt/models/baichuan.py,sha256=PzBOFcEAixakPEkQSaJwC0Xc1fu-yCsN9T0I67r8QmY,14919
 sglang/srt/models/chatglm.py,sha256=DOrEhmb0s-yPId88R6nJeLOTUEtogk-vkB69qT2JdWc,12913
 sglang/srt/models/commandr.py,sha256=PNXgfOZF84h-rSH0edEECUmEGW8YLb44V75Z_oDhFiA,14223
-sglang/srt/models/dbrx.py,sha256=
-sglang/srt/models/deepseek.py,sha256=
-sglang/srt/models/deepseek_v2.py,sha256
+sglang/srt/models/dbrx.py,sha256=okIpIwdr8Cfrz_thzc1F75XqCUfHhFLvZ1B6BaswKoA,14585
+sglang/srt/models/deepseek.py,sha256=_cVOvR6eSEgRf6TUBpTD5uMdijDWFw4sSt4lGzl8tbg,15697
+sglang/srt/models/deepseek_v2.py,sha256=-v_OJr2c3gJ0NMxQjvT3Jknz1XPGkzKx0TVR3NIiC6A,37284
 sglang/srt/models/exaone.py,sha256=dkERTZVxrRroqu5AGLP7D4N6n8HvDqlNaDQUIe15mZY,13038
 sglang/srt/models/gemma.py,sha256=ydRqsG-7004r1fAiz01LHUmcj_6XN0Tn4xO1keJnMQk,12126
-sglang/srt/models/gemma2.py,sha256
+sglang/srt/models/gemma2.py,sha256=-bFN-Te3YWAunLCrF-XFk_6fJS7gHM4Ca6h6aesXUTM,16362
 sglang/srt/models/gemma2_reward.py,sha256=nJ01KfqLSJtqMLm3sG8p2mGZFK1xhhjh7I7Ccb-_Hq8,2494
 sglang/srt/models/gpt2.py,sha256=2je1kE09sGcaORWnJuGYAkcwwOrT9EK-KhQaoCKjCSA,9517
 sglang/srt/models/gpt_bigcode.py,sha256=tovyOdJu2x3LkzmkdFXX_iJdkxuyChIDxwgvPBy6UPo,9528
 sglang/srt/models/granite.py,sha256=AeQY9Dxd1ZnwgCYBK0vSXXiMGM-yt9iaOVf_ruOUHXw,20409
-sglang/srt/models/grok.py,sha256=
+sglang/srt/models/grok.py,sha256=J9lgNbFebvXgF19nfZyHwlGPlGWY_m0LgP506YvOYrU,15668
 sglang/srt/models/internlm2.py,sha256=_xcKtd6YtEFUTozaN-yUb0xbSYckRpomfPSKcAk4j-Y,12127
 sglang/srt/models/internlm2_reward.py,sha256=8K26A9oIFFGx_9U2mF87j7FX8K87HGKMnVL3ht1Uc7I,2398
-sglang/srt/models/llama.py,sha256=
+sglang/srt/models/llama.py,sha256=o3FYyOhkZJirzugyYz1kxs6RpY84O_uKowWWmt3jv24,19929
 sglang/srt/models/llama_classification.py,sha256=DwboM1xHXdf3Fddf7xGnrfdOLJwXdiJs994cIpAPa2g,2984
 sglang/srt/models/llama_embedding.py,sha256=rh-AiczPY_pTpzcACHvSMVjh1hsV_MZBBwP0LQxPsGM,3130
 sglang/srt/models/llama_reward.py,sha256=oPxh5E2UkxLULNdR68dFvt2I7j33CJFN6nyA-8L2_cg,4516
@@ -142,27 +145,27 @@ sglang/srt/models/llavavid.py,sha256=dYUkKfHoE15vF_VXA_s_ICCTUMSmSgvP181fk8dUi0g
 sglang/srt/models/minicpm.py,sha256=ws4AqhOfAvYHGd04QuXCZel-Oxy9_vN4p4rTjs9RSz0,13723
 sglang/srt/models/minicpm3.py,sha256=YIKJDTpwjmpLlv1sNT93k2yZMvGQlI_H87czjf6QYyo,24707
 sglang/srt/models/mistral.py,sha256=EYifJUUzN2Z2-iL37eJiNZF_DB0H4pa0mKlgYRIxM70,838
-sglang/srt/models/mixtral.py,sha256=
+sglang/srt/models/mixtral.py,sha256=L2Gz-Cmih1V75Ks9jmI2a6rUQ1Cl6F2uDgrhDjjDJzs,14523
 sglang/srt/models/mixtral_quant.py,sha256=uuVO1nWUZJiDhbqZN6gzSMwyfpyZorMuFXHeMCGo7N0,14022
 sglang/srt/models/mllama.py,sha256=3kX-UqeTSYZL5kPNdkfKEAEv3DpSAW1ArAAoeiXVzIc,37739
 sglang/srt/models/olmo.py,sha256=OCDMtX1OI83r80mzU4FMC3Tg8cleQ-7C8Tpoe8zgzss,11708
 sglang/srt/models/olmo2.py,sha256=aC7svioN7XT5owRxPrvhvWBNMON9QXGQBWJ1KHMyXeA,13442
-sglang/srt/models/olmoe.py,sha256=
+sglang/srt/models/olmoe.py,sha256=LiHVGfRaC5c_BU_vVgtV9uLuDH_SC0dw1kEc61posmI,15351
 sglang/srt/models/phi3_small.py,sha256=44_my3QmgJ2N7SOkGZzEb62DXBeCVHojfmCWgkk2uCI,14802
 sglang/srt/models/qwen.py,sha256=_FKDbwaS5C07uJyyivZpBrXJVej4Ph9ivzJdzWJPxJ4,9904
 sglang/srt/models/qwen2.py,sha256=be4xgcuqNa9kBdaL7x3PjsnUky6fh5K33c_khAWSi04,12959
-sglang/srt/models/qwen2_moe.py,sha256=
+sglang/srt/models/qwen2_moe.py,sha256=6xRRJxWWh1M5UFPfvhsCpY477zv-30AeSRJXsvOkgFc,16542
 sglang/srt/models/qwen2_vl.py,sha256=3EaUlTbyWOTRXA7eViK1WqmVbCFhXLIpnos49zzf-yM,26561
 sglang/srt/models/registry.py,sha256=inKh9iwOp3LFYm3nqujg-OtABClOP-ifc1stA9cZegA,3434
 sglang/srt/models/stablelm.py,sha256=iBlIkM7CQmqI25nsujWk0LLCQD7TshzUU8qzZYYrt20,11311
 sglang/srt/models/torch_native_llama.py,sha256=YeXHorFm6QfnczLXwPb5TG9a-He0uiA9RzpR1YZKGg4,18758
 sglang/srt/models/xverse.py,sha256=Oq--KqvbYu2H4TMVGEHpSnJLEwXBpxlncR9ilsQeckc,13579
-sglang/srt/models/xverse_moe.py,sha256=
+sglang/srt/models/xverse_moe.py,sha256=7E60YIST4ELYwLRgjtHiLRI5Uyc7XqQTM7jQXiWaQs4,15541
 sglang/srt/models/yivl.py,sha256=88OubtuZ38Dxb2LzfV_MTPBI4wKhh4NJqFu--efbhFM,4809
-sglang/srt/openai_api/adapter.py,sha256=
-sglang/srt/openai_api/protocol.py,sha256=
+sglang/srt/openai_api/adapter.py,sha256=X0HLuNhg-chDQjcdsQIRpZijlImEwZLHum3G0JgU4Go,54834
+sglang/srt/openai_api/protocol.py,sha256=RMzeDfh2tZITjhNwB2nX68wZwQe40N6HBuVebCzEWiU,10468
 sglang/srt/sampling/sampling_batch_info.py,sha256=s--zNjk-LErZ5lMqnZ7KiuJltaziKRbQAU5qYpKIxAc,8564
-sglang/srt/sampling/sampling_params.py,sha256=
+sglang/srt/sampling/sampling_params.py,sha256=BkgCJAOSmQXwJrNXg26zSjKfMy0d5mMN6oHRk_ZuESI,5499
 sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
 sglang/srt/sampling/penaltylib/orchestrator.py,sha256=J-DEemZcKm1--o37kf3qDOE8SZ_6H3d5oex49Mgq2ZU,10762
 sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=1Zp2aL6dD60mwD1tCcSG0x5IYo0v4z9ce-q_YwbJ9f8,2490
@@ -180,12 +183,13 @@ sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWU
 sglang/test/simple_eval_mgsm.py,sha256=rd7TSUyxdKbrXaVoewo24V8lCo_6kO8zxPhhmvylpw8,10259
 sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
 sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
+sglang/test/test_block_fp8.py,sha256=rhrIun8aW5zq2qvuGRlo7F7aZ_upjVxtQMVlyc2Th_E,11771
 sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
 sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
 sglang/test/test_utils.py,sha256=HJG7kUQOk6n9FBbH89PDtQ41C3kt1cfJODhAEcFT0AQ,23823
 sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
-sglang-0.4.
-sglang-0.4.
-sglang-0.4.
-sglang-0.4.
-sglang-0.4.
+sglang-0.4.1.post1.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
+sglang-0.4.1.post1.dist-info/METADATA,sha256=R2YDOrUU_49x5TEbNUODNlXvkSIzFqT7-hvInlSCs5k,22527
+sglang-0.4.1.post1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+sglang-0.4.1.post1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.4.1.post1.dist-info/RECORD,,