sglang 0.4.0.post2__py3-none-any.whl → 0.4.1.post1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (53)
  1. sglang/bench_offline_throughput.py +0 -12
  2. sglang/bench_one_batch.py +0 -12
  3. sglang/bench_serving.py +11 -2
  4. sglang/lang/backend/openai.py +10 -0
  5. sglang/srt/aio_rwlock.py +100 -0
  6. sglang/srt/configs/model_config.py +8 -1
  7. sglang/srt/constrained/xgrammar_backend.py +6 -0
  8. sglang/srt/layers/attention/flashinfer_backend.py +49 -5
  9. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -14
  10. sglang/srt/layers/linear.py +20 -2
  11. sglang/srt/layers/{ep_moe → moe/ep_moe}/layer.py +14 -39
  12. sglang/srt/layers/moe/fused_moe_native.py +46 -0
  13. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/__init__.py +3 -7
  14. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/fused_moe.py +124 -99
  15. sglang/srt/layers/{fused_moe_triton → moe/fused_moe_triton}/layer.py +16 -48
  16. sglang/srt/layers/moe/topk.py +205 -0
  17. sglang/srt/layers/quantization/__init__.py +3 -3
  18. sglang/srt/layers/quantization/fp8.py +169 -32
  19. sglang/srt/layers/quantization/fp8_kernel.py +292 -0
  20. sglang/srt/layers/quantization/fp8_utils.py +90 -1
  21. sglang/srt/layers/torchao_utils.py +11 -15
  22. sglang/srt/managers/schedule_batch.py +16 -10
  23. sglang/srt/managers/schedule_policy.py +1 -1
  24. sglang/srt/managers/scheduler.py +13 -16
  25. sglang/srt/managers/tokenizer_manager.py +130 -111
  26. sglang/srt/mem_cache/memory_pool.py +15 -8
  27. sglang/srt/model_executor/cuda_graph_runner.py +1 -1
  28. sglang/srt/model_loader/loader.py +22 -11
  29. sglang/srt/models/dbrx.py +1 -1
  30. sglang/srt/models/deepseek.py +1 -1
  31. sglang/srt/models/deepseek_v2.py +67 -18
  32. sglang/srt/models/gemma2.py +19 -0
  33. sglang/srt/models/grok.py +1 -1
  34. sglang/srt/models/llama.py +2 -2
  35. sglang/srt/models/mixtral.py +2 -2
  36. sglang/srt/models/olmoe.py +1 -1
  37. sglang/srt/models/qwen2_moe.py +1 -1
  38. sglang/srt/models/xverse_moe.py +1 -1
  39. sglang/srt/openai_api/adapter.py +23 -0
  40. sglang/srt/openai_api/protocol.py +2 -0
  41. sglang/srt/sampling/sampling_params.py +9 -2
  42. sglang/srt/server.py +21 -37
  43. sglang/srt/utils.py +33 -44
  44. sglang/test/test_block_fp8.py +341 -0
  45. sglang/version.py +1 -1
  46. {sglang-0.4.0.post2.dist-info → sglang-0.4.1.post1.dist-info}/METADATA +4 -4
  47. {sglang-0.4.0.post2.dist-info → sglang-0.4.1.post1.dist-info}/RECORD +52 -48
  48. sglang/srt/layers/fused_moe_patch.py +0 -133
  49. /sglang/srt/layers/{ep_moe → moe/ep_moe}/__init__.py +0 -0
  50. /sglang/srt/layers/{ep_moe → moe/ep_moe}/kernels.py +0 -0
  51. {sglang-0.4.0.post2.dist-info → sglang-0.4.1.post1.dist-info}/LICENSE +0 -0
  52. {sglang-0.4.0.post2.dist-info → sglang-0.4.1.post1.dist-info}/WHEEL +0 -0
  53. {sglang-0.4.0.post2.dist-info → sglang-0.4.1.post1.dist-info}/top_level.txt +0 -0
sglang/test/test_block_fp8.py ADDED
@@ -0,0 +1,341 @@
+ import itertools
+ import unittest
+
+ import torch
+
+ from sglang.srt.layers.activation import SiluAndMul
+ from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+ from sglang.srt.layers.quantization.fp8_kernel import (
+     per_token_group_quant_fp8,
+     w8a8_block_fp8_matmul,
+ )
+
+
+ # For test
+ def native_per_token_group_quant_fp8(
+     x, group_size, eps=1e-10, dtype=torch.float8_e4m3fn
+ ):
+     """Function to perform per-token-group quantization on an input tensor `x` using native torch.
+
+     It converts the tensor values into float8 values and returns the
+     quantized tensor along with the scaling factor used for quantization.
+     Note that only `torch.float8_e4m3fn` is supported for now.
+     """
+     assert (
+         x.shape[-1] % group_size == 0
+     ), "the last dimension of `x` must be divisible by `group_size`"
+     assert x.is_contiguous(), "`x` is not contiguous"
+
+     finfo = torch.finfo(dtype)
+     fp8_min = finfo.min
+     fp8_max = finfo.max
+
+     x_ = x.reshape(x.numel() // group_size, group_size)
+     amax = x_.abs().max(dim=-1, keepdim=True)[0].clamp(min=eps).to(torch.float32)
+     x_s = amax / fp8_max
+     x_q = (x_ / x_s).clamp(min=fp8_min, max=fp8_max).to(dtype)
+     x_q = x_q.reshape(x.shape)
+     x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size,))
+
+     return x_q, x_s
+
+
+ class TestPerTokenGroupQuantFP8(unittest.TestCase):
+     DTYPES = [torch.half, torch.bfloat16, torch.float32]
+     NUM_TOKENS = [7, 83, 2048]
+     D = [512, 4096, 5120, 13824]
+     GROUP_SIZE = [64, 128, 256, 512]
+     SEEDS = [0]
+
+     @classmethod
+     def setUpClass(cls):
+         if not torch.cuda.is_available():
+             raise unittest.SkipTest("CUDA is not available")
+         torch.set_default_device("cuda")
+
+     def _per_token_group_quant_fp8(self, num_tokens, d, dtype, group_size, seed):
+         torch.manual_seed(seed)
+
+         x = torch.rand(num_tokens, d, dtype=dtype)
+
+         with torch.inference_mode():
+             ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size)
+             out, scale = per_token_group_quant_fp8(x, group_size)
+
+         self.assertTrue(
+             torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.15)
+         )
+         self.assertTrue(torch.allclose(scale, ref_scale))
+
+     def test_per_token_group_quant_fp8(self):
+         for params in itertools.product(
+             self.NUM_TOKENS,
+             self.D,
+             self.DTYPES,
+             self.GROUP_SIZE,
+             self.SEEDS,
+         ):
+             with self.subTest(
+                 num_tokens=params[0],
+                 d=params[1],
+                 dtype=params[2],
+                 group_size=params[3],
+                 seed=params[4],
+             ):
+                 self._per_token_group_quant_fp8(*params)
+
+
+ # For test
+ def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
+     """This function performs matrix multiplication with block-wise quantization using native torch.
+
+     It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+     The output is returned in the specified `output_dtype`.
+     """
+
+     A = A.to(torch.float32)
+     B = B.to(torch.float32)
+     assert A.shape[-1] == B.shape[-1]
+     assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+     assert len(block_size) == 2
+     block_n, block_k = block_size[0], block_size[1]
+     assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+     assert A.shape[:-1] == As.shape[:-1]
+
+     M = A.numel() // A.shape[-1]
+     N, K = B.shape
+     origin_C_shape = A.shape[:-1] + (N,)
+     A = A.reshape(M, A.shape[-1])
+     As = As.reshape(M, As.shape[-1])
+     n_tiles = (N + block_n - 1) // block_n
+     k_tiles = (K + block_k - 1) // block_k
+     assert n_tiles == Bs.shape[0]
+     assert k_tiles == Bs.shape[1]
+
+     C_shape = (M, N)
+     C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+     A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)]
+     B_tiles = [
+         [
+             B[
+                 j * block_n : min((j + 1) * block_n, N),
+                 i * block_k : min((i + 1) * block_k, K),
+             ]
+             for i in range(k_tiles)
+         ]
+         for j in range(n_tiles)
+     ]
+     C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)]
+     As_tiles = [As[:, i : i + 1] for i in range(k_tiles)]
+
+     for i in range(k_tiles):
+         for j in range(n_tiles):
+             a = A_tiles[i]
+             b = B_tiles[j][i]
+             c = C_tiles[j]
+             s = As_tiles[i] * Bs[j][i]
+             c[:, :] += torch.matmul(a, b.t()) * s
+
+     C = C.reshape(origin_C_shape).to(output_dtype)
+     return C
+
+
+ class TestW8A8BlockFP8Matmul(unittest.TestCase):
+     OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
+     M = [1, 7, 83, 512, 2048]
+     N = [128, 512, 1024, 4096, 7748, 13824]
+     K = [256, 4096, 5120, 3884, 13824]
+     # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+     BLOCK_SIZE = [[128, 128]]
+     SEEDS = [0]
+
+     @classmethod
+     def setUpClass(cls):
+         if not torch.cuda.is_available():
+             raise unittest.SkipTest("CUDA is not available")
+         torch.set_default_device("cuda")
+
+     def _w8a8_block_fp8_matmul(self, M, N, K, block_size, out_dtype, seed):
+         torch.manual_seed(seed)
+         # NOTE(HandH1998): to avoid overflow when out_dtype = torch.half
+         factor_for_scale = 1e-2
+         fp8_info = torch.finfo(torch.float8_e4m3fn)
+         fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+         A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+         A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+         B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+         B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+         block_n, block_k = block_size[0], block_size[1]
+         n_tiles = (N + block_n - 1) // block_n
+         k_tiles = (K + block_k - 1) // block_k
+
+         As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale
+         Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
+
+         with torch.inference_mode():
+             ref_out = native_w8a8_block_fp8_matmul(
+                 A_fp8, B_fp8, As, Bs, block_size, out_dtype
+             )
+             out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
+
+         self.assertTrue(
+             torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+             / torch.mean(torch.abs(ref_out.to(torch.float32)))
+             < 0.001
+         )
+
+     def test_w8a8_block_fp8_matmul(self):
+         for params in itertools.product(
+             self.M,
+             self.N,
+             self.K,
+             self.BLOCK_SIZE,
+             self.OUT_DTYPES,
+             self.SEEDS,
+         ):
+             with self.subTest(
+                 M=params[0],
+                 N=params[1],
+                 K=params[2],
+                 block_size=params[3],
+                 out_dtype=params[4],
+                 seed=params[5],
+             ):
+                 self._w8a8_block_fp8_matmul(*params)
+
+
+ # For test
+ def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
+     """This function performs fused moe with block-wise quantization using native torch."""
+
+     B, D = a.shape
+     a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+     out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+     score = torch.softmax(score, dim=-1, dtype=torch.float32)
+     topk_weight, topk_ids = torch.topk(score, topk)
+     topk_weight = topk_weight.view(-1)
+     topk_ids = topk_ids.view(-1)
+
+     _, block_k = block_shape[0], block_shape[1]
+     a_q, a_s = native_per_token_group_quant_fp8(a, block_k)
+     # NOTE(HandH1998): Since "index_cuda" is not implemented for 'Float8_e4m3fn', we need to cast `float8` to `float32`.
+     a_q = a_q.to(torch.float32)
+     for i in range(w1.shape[0]):
+         mask = topk_ids == i
+         if mask.sum():
+             inter_out = native_w8a8_block_fp8_matmul(
+                 a_q[mask], w1[i], a_s[mask], w1_s[i], block_shape, output_dtype=a.dtype
+             )
+             act_out = SiluAndMul().forward_native(inter_out)
+             act_out_q, act_out_s = native_per_token_group_quant_fp8(act_out, block_k)
+             act_out = act_out.to(torch.float32)
+             out[mask] = native_w8a8_block_fp8_matmul(
+                 act_out_q, w2[i], act_out_s, w2_s[i], block_shape, output_dtype=a.dtype
+             )
+     return (
+         out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
+     ).sum(dim=1)
+
+
+ class TestW8A8BlockFP8FusedMoE(unittest.TestCase):
+     DTYPES = [torch.float32, torch.half, torch.bfloat16]
+     M = [1, 33, 64, 222, 1024 * 128]
+     N = [128, 1024, 2048]
+     K = [256, 4096, 5120]
+     E = [8, 24]
+     TOP_KS = [2, 6]
+     BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+     # BLOCK_SIZE = [[128, 128]]
+     SEEDS = [0]
+
+     @classmethod
+     def setUpClass(cls):
+         if not torch.cuda.is_available():
+             raise unittest.SkipTest("CUDA is not available")
+         torch.set_default_device("cuda")
+
+     def _w8a8_block_fp8_fused_moe(self, M, N, K, E, topk, block_size, dtype, seed):
+         torch.manual_seed(seed)
+         # NOTE(HandH1998): to avoid overflow when out_dtype = torch.half
+         factor_for_scale = 1e-2
+         fp8_info = torch.finfo(torch.float8_e4m3fn)
+         fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+         a = torch.randn((M, K), dtype=dtype) / 10
+
+         w1_fp32 = (torch.rand((E, 2 * N, K), dtype=torch.float32) - 0.5) * 2 * fp8_max
+         w1 = w1_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+         w2_fp32 = (torch.rand((E, K, N), dtype=torch.float32) - 0.5) * 2 * fp8_max
+         w2 = w2_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+         block_n, block_k = block_size[0], block_size[1]
+         n_tiles_w1 = (2 * N + block_n - 1) // block_n
+         n_tiles_w2 = (K + block_n - 1) // block_n
+         k_tiles_w1 = (K + block_k - 1) // block_k
+         k_tiles_w2 = (N + block_k - 1) // block_k
+
+         w1_s = (
+             torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32)
+             * factor_for_scale
+         )
+         w2_s = (
+             torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32)
+             * factor_for_scale
+         )
+
+         score = torch.randn((M, E), dtype=dtype)
+
+         with torch.inference_mode():
+             out = fused_moe(
+                 a,
+                 w1,
+                 w2,
+                 score,
+                 topk,
+                 renormalize=False,
+                 use_fp8_w8a8=True,
+                 w1_scale=w1_s,
+                 w2_scale=w2_s,
+                 block_shape=block_size,
+             )
+             ref_out = torch_w8a8_block_fp8_moe(
+                 a, w1, w2, w1_s, w2_s, score, topk, block_size
+             )
+
+         self.assertTrue(
+             torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+             / torch.mean(torch.abs(ref_out.to(torch.float32)))
+             < 0.02
+         )
+
+     def test_w8a8_block_fp8_fused_moe(self):
+         for params in itertools.product(
+             self.M,
+             self.N,
+             self.K,
+             self.E,
+             self.TOP_KS,
+             self.BLOCK_SIZE,
+             self.DTYPES,
+             self.SEEDS,
+         ):
+             with self.subTest(
+                 M=params[0],
+                 N=params[1],
+                 K=params[2],
+                 E=params[3],
+                 topk=params[4],
+                 block_size=params[5],
+                 dtype=params[6],
+                 seed=params[7],
+             ):
+                 self._w8a8_block_fp8_fused_moe(*params)
+
+
+ if __name__ == "__main__":
+     unittest.main(verbosity=2)
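
The arithmetic behind the new reference helpers is compact enough to illustrate outside the test harness. Below is a minimal, illustrative CPU sketch of the per-token-group scheme exercised above; the variable names are local to this sketch, and only torch (2.1+ for `float8_e4m3fn`) is assumed. Each group's scale is `amax / fp8_max`; quantization divides by the scale, clamps, and casts, and dequantization multiplies back.

import torch

# One token with a single group of size 4; mirrors native_per_token_group_quant_fp8.
fp8 = torch.finfo(torch.float8_e4m3fn)        # e4m3 max = 448.0
x = torch.tensor([[0.5, -2.0, 3.0, -1.5]])
amax = x.abs().max(dim=-1, keepdim=True)[0]   # per-group absolute maximum (3.0 here)
scale = amax / fp8.max                        # per-group scale (~0.0067 here)
x_q = (x / scale).clamp(fp8.min, fp8.max).to(torch.float8_e4m3fn)
x_dq = x_q.to(torch.float32) * scale          # recovers x up to fp8 rounding error

`native_w8a8_block_fp8_matmul` undoes the same scaling inside the matmul: each output tile is accumulated in float32 and multiplied by the product of the matching activation and weight scales, which is why the tests check a relative error bound rather than exact equality.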
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.4.0.post2"
+ __version__ = "0.4.1.post1"
{sglang-0.4.0.post2.dist-info → sglang-0.4.1.post1.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.4.0.post2
+ Version: 0.4.1.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -234,7 +234,6 @@ Requires-Dist: pydantic; extra == "runtime-common"
  Requires-Dist: python-multipart; extra == "runtime-common"
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
  Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
- Requires-Dist: gemlite; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
  Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
@@ -244,6 +243,7 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
  Requires-Dist: flashinfer==0.1.6; extra == "srt"
+ Requires-Dist: sgl-kernel>=0.0.2.post10; extra == "srt"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
@@ -358,8 +358,8 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

  ## Adoption and Sponsorship
- The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+ The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, xAI, 01.AI.

  ## Acknowledgment and Citation
  We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
- Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+ Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
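
One practical consequence of the metadata changes above: the `srt` extra now pulls in `sgl-kernel`, and `gemlite` is no longer a `runtime-common` requirement. A hypothetical post-upgrade check (not part of the package) using only the standard library:

from importlib.metadata import PackageNotFoundError, version

# Confirm the version bump recorded in this diff.
assert version("sglang") == "0.4.1.post1"

# sgl-kernel>=0.0.2.post10 is a new requirement of the `srt` extra.
try:
    print("sgl-kernel:", version("sgl-kernel"))
except PackageNotFoundError:
    print("sgl-kernel not found; reinstall sglang with the srt extra")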
{sglang-0.4.0.post2.dist-info → sglang-0.4.1.post1.dist-info}/RECORD RENAMED
@@ -1,17 +1,17 @@
  sglang/__init__.py,sha256=b2oIdWzp5P8SzieeOs2TzJoN3Do3tfJbV8gZS_imVcs,1619
  sglang/api.py,sha256=NdO6cYnklnEBQBKqQjlqI8-P1EownKQ71t5ibCGhEVo,6953
  sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
- sglang/bench_offline_throughput.py,sha256=rgMWDhA1Hai0gKBzxc0dzTWfI8l39Cyw2VOCyMt1YyY,12771
- sglang/bench_one_batch.py,sha256=aF0onHeRjy7AYVjsq1IA3rZEhUuYXuslg1fAhuvJ2yo,16120
+ sglang/bench_offline_throughput.py,sha256=iQiJCK3KQDCdwU1NVbIwbtthssWzBXiIsKUDA7Z_hO0,12510
+ sglang/bench_one_batch.py,sha256=jkyMhK0lqn5dRCYgAh30qZrNHP4gAbXODymBMNXK86I,15859
  sglang/bench_one_batch_server.py,sha256=-fV9FTLNNcSIy0pgYeggXedPVK0fVsXZqVQswT8OMOY,5945
- sglang/bench_serving.py,sha256=zv_EcbWno79j7WYFL2m6BfCLT6iSOfGV4uwGbDg9KQA,53141
+ sglang/bench_serving.py,sha256=YQiCZreejCPBTqMmZsCB99RMi1N-Jx-dZtaafcQ8-14,53377
  sglang/check_env.py,sha256=4OqpZaEJOfBM6-vtPILto5kqDmgiZM1Koc7lK78A7CI,8427
  sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
  sglang/launch_server.py,sha256=4y2QeSj0wVNB9MJQZeahD4ahTDU6gwqo7MPUytyFop0,403
  sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
  sglang/llama3_eval.py,sha256=gWSboDchIGybIce88bJlrCG0yiLZ513mw4gcutJlzGM,10017
  sglang/utils.py,sha256=23jf4Mz8E5p5a6JOkjnfYZixdjZUk88F_mZ8rZcby5Q,11597
- sglang/version.py,sha256=OUNovuQ1RrdJFYetl0e0U0556H_wiyjhVks9-l-zF94,28
+ sglang/version.py,sha256=ARioq8ApVNckeQorLPVfHZeN9mlHMLbaNgLGNbGq-ys,28
  sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sglang/lang/chat_template.py,sha256=cnfjjxIIcYRGRxXlJlOGnpFxFuhMHut7DS52LsOMKcA,15826
  sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -23,28 +23,29 @@ sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSu
  sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
  sglang/lang/backend/base_backend.py,sha256=tdoh9YF3CyekY1BKiX9n7-aA4srDWIuA4RDJLM7q8qg,1985
  sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
- sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
+ sglang/lang/backend/openai.py,sha256=ha9a2P6T80TmSgYlyIwB1qYawWkjcOgiOptkktkqa1U,15436
  sglang/lang/backend/runtime_endpoint.py,sha256=dfs-yZ1ekKmnbpZLluQHWPmMeZJKbaaZRRGYRa9eBE8,10541
  sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
  sglang/srt/_custom_ops.py,sha256=Y4gyTDGhWz-W2Igq25Ojm8XFiyvkawW9I-79iwYvxJ0,3574
+ sglang/srt/aio_rwlock.py,sha256=6LYtOdeTUY3hkfa1dmYkgsaF2ttrwIF3hUWz2AZ2fqw,2970
  sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21201
  sglang/srt/hf_transformers_utils.py,sha256=38Ms0H2-VMerOS6jnczcFtZMS6lhw9B5rSWKAfxVUfQ,7945
  sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
  sglang/srt/model_parallel.py,sha256=eLXZhvJ4wG6dh0FontNCIdVZvHYdWgaeY-5cu7TD9tE,6078
- sglang/srt/server.py,sha256=tEciMH_U6WIZYPUGDDM0c4BQ16cvgVdA4II-ksPZoMo,34621
+ sglang/srt/server.py,sha256=vDucJl6qtEK2swzPJ_wYitaJvsI4MigMagGlBlH5V54,34033
  sglang/srt/server_args.py,sha256=LgnQ-kBJZ3E7hMMZj9bSK0mn7Bhjk1nJHxLcxl-lGTM,34572
- sglang/srt/utils.py,sha256=WWEcMJHmvlOjiqE9UicT0ZYwa2PUKDZorAk2Y8PPRBI,42039
+ sglang/srt/utils.py,sha256=J8kFl6kDBwFZCM6AKaVTiqdhJKRg0JOH0pNrD1ZeWmM,41726
  sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
  sglang/srt/configs/device_config.py,sha256=dResqHjkg_dq10v6rnVpbXpvABZRB0jylOm-2_JAnx0,428
  sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
  sglang/srt/configs/load_config.py,sha256=TcPi_HY6xu5SiVZsxPOoB5pGeDUNebOk7muoUH9VBDg,3083
- sglang/srt/configs/model_config.py,sha256=OjEeigs5tMNKP-RImJk2NHVFXv-fyQfsGREWMO3rqhM,15839
+ sglang/srt/configs/model_config.py,sha256=vVarlLTw9Ged1PXIwRP-R8UhiG6oaezNIZhTNuF0eQc,16070
  sglang/srt/configs/qwen2vl.py,sha256=ZjLy9v2eZY4wptUfY3CWgYKg2B5DDrkfCSyTy_Zf_bg,4351
  sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO0ATuk,786
  sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
  sglang/srt/constrained/outlines_backend.py,sha256=CipNHNNXs8xtnJNVNe6FCwZUlSbIXbGmWVlZz3hUpFQ,6820
  sglang/srt/constrained/outlines_jump_forward.py,sha256=iZWXeR3gNYoMubLGyFmLPO4V2YsN5DiGjD71Xk9iFaE,6418
- sglang/srt/constrained/xgrammar_backend.py,sha256=4It9_GqU4UZFhxIw_7hkzpXaMPUtksk6Xfe0Agsfw7A,4620
+ sglang/srt/constrained/xgrammar_backend.py,sha256=76oUFXeB29bfnEVWa1-rIrwQm5jhuMlzAX10HtAq1fQ,4887
  sglang/srt/distributed/__init__.py,sha256=__tl9Frrf3PFrSyNYcn5i-y2rL-J4-Qn6RJwrsZ4xgc,83
  sglang/srt/distributed/communication_op.py,sha256=ZoIhboZyefiAwr-1K-wF3rAFSQ4Wt-RxXpsX443Gbt4,1157
  sglang/srt/distributed/parallel_state.py,sha256=HplRH5S0AWdwSdhoHYX9_UWQZlFjh2Z1LHaz68EXlpE,47555
@@ -60,35 +61,37 @@ sglang/srt/distributed/device_communicators/shm_broadcast.py,sha256=WVxBd1QfIgRW
  sglang/srt/distributed/device_communicators/xpu_communicator.py,sha256=P3WKgddcfpUhBa-_5PvjYxH146ZE-N1cotTzEpPRKlY,1620
  sglang/srt/layers/activation.py,sha256=EboMjT9HV2tNHQ6rzpojtlkzev1lAFbhQlxMg9hwxBQ,5471
  sglang/srt/layers/custom_op_util.py,sha256=0vu-yX2wwonmO1L_o5G7SA6C-8XuhDIh9rPDvNeLhoc,922
- sglang/srt/layers/fused_moe_patch.py,sha256=DMIyrwOON7OSidKZdreL5HzMhP0AD5Ues0xdY-ADOQw,4471
  sglang/srt/layers/layernorm.py,sha256=nRQ1w1xSUcU-zlqVC61BnGG6otS5W1w9VaSzeXizrx4,4037
- sglang/srt/layers/linear.py,sha256=dF2HvqiMbhWlCjvkLFRCcgUFGhG-B0keM_CIpjvgTtg,46154
+ sglang/srt/layers/linear.py,sha256=KyRFU0VcoNuN-hnQB9QQcBN9NCpeqPtLzzufIHUpV6w,47064
  sglang/srt/layers/logits_processor.py,sha256=JlOU0x8vBGIuTwHSdjR6Kly9_uzilBMv0NE_rvUx0W4,14747
  sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
  sglang/srt/layers/radix_attention.py,sha256=E4cmvkcCdCtb6VyLNrCKy1D6VwHQ063oH3JQXPaRy6w,2178
  sglang/srt/layers/rotary_embedding.py,sha256=29tx3JNR40AoXqBa2cFGBjva9vU2xgFipETlpMaaZas,3985
  sglang/srt/layers/sampler.py,sha256=k4Op_HMkQfT7t9wgQwBVotfTUXEocrzRyQqEFnff1pc,5511
- sglang/srt/layers/torchao_utils.py,sha256=07Fe2Csdh1JiQKPGGHWkbq0-a6bV7Cq136ygdtVAhgI,3708
+ sglang/srt/layers/torchao_utils.py,sha256=dQVuWNXxAvOPjr2G5BBMWqC2oKcS2B52rx-fEc_elmc,3545
  sglang/srt/layers/vocab_parallel_embedding.py,sha256=slGwLiWjuFLCUdRe-GTlfumyZpqVX9VF6No_UGOT-hA,21624
  sglang/srt/layers/attention/__init__.py,sha256=KIJhzOJWYioQE7Va4D83-V-ZUZVMZcczuNgDC3dlSRo,2583
  sglang/srt/layers/attention/double_sparsity_backend.py,sha256=RQdEKRykSLf9ilnaHmR6T7RFqh4emH_adfB3aJN2BUU,10920
- sglang/srt/layers/attention/flashinfer_backend.py,sha256=NgeigL1WiPOuOry0Gbxv-6HEcERB8Du0mBJgYcTVIAA,24943
+ sglang/srt/layers/attention/flashinfer_backend.py,sha256=umD1E2zvMnPbbgvx2Ex5LQB6a4a41brjsks1M0gFMMU,26357
  sglang/srt/layers/attention/torch_native_backend.py,sha256=nQdeqWEMMH_wrod5wssDCJG-uPKm0uslvkALKqPRPQ8,10509
  sglang/srt/layers/attention/triton_backend.py,sha256=-TobyZHwlbJ5HhbFg-jgCqVOw4Y-opgEuFo-EusASQc,6264
  sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=oJ_UK1t229zF3hbTDiQe7t-X-IbM2dOxx4U2ch-vmjA,17847
  sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
- sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=tZJhzqcf1KKMT8z7_32eVk_D1NHP71c-S3UNxemfAHM,11542
+ sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=DWOZXSTVN5ZbcFjDjcqs-nPdUkxSwum0SVXhVKqwh2g,11688
  sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=lojFXRZMLWkzS2Y8uxaolnQhXaWKG19mCAWaF5KQeiI,6087
- sglang/srt/layers/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sglang/srt/layers/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
- sglang/srt/layers/ep_moe/layer.py,sha256=uMropMhU-MaycoxSLxcfD0jZC_cuL_boRbIu86mbZjY,23034
- sglang/srt/layers/fused_moe_triton/__init__.py,sha256=PHKFqd2hPOO-g9kSMseg2g76lpg9OGXQDThWU6bt9vs,902
- sglang/srt/layers/fused_moe_triton/fused_moe.py,sha256=fLGmkY6imJYjEw9-3-jJthkMcFGMBcu9HCNIuxAzMhE,29625
- sglang/srt/layers/fused_moe_triton/layer.py,sha256=eMpbZlP3FAQxbHochis7ybZ-fsNBP0PzKF1PN0Xo7so,21517
- sglang/srt/layers/quantization/__init__.py,sha256=FgNy_zNWMWnq3lEGyCSyfLSQtcZtWlq99JilkmEDW7I,4594
+ sglang/srt/layers/moe/fused_moe_native.py,sha256=8q-LFZMSCGLc2_Gltp2lH0gSb4A1WOuKQW3wo3rpj5g,1601
+ sglang/srt/layers/moe/topk.py,sha256=JpeIl_-CNk0yyG3k5fmmNbbmR2_9bkKC23UoLOlMkjw,6954
+ sglang/srt/layers/moe/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/srt/layers/moe/ep_moe/kernels.py,sha256=wb_S2qLxoWWgQu9coXy0XLNGvHzdZSdwXr0PGy4QySg,10940
+ sglang/srt/layers/moe/ep_moe/layer.py,sha256=6iQU5ZjQ8IXGoQ8ZlBuJqyQxYTEem9vXI6rbVIWKlZw,22303
+ sglang/srt/layers/moe/fused_moe_triton/__init__.py,sha256=h9yMFAL_bagUf-qBED8gSWdCOb7d8IdA-pE-L_nIg8E,842
+ sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=zXwWUtthLa9E35EvlQ9A_mnIsQyA0_NYKsUBdJqONHo,31163
+ sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=BclDj5JyCy-8Bfue4broL1-IG6a4dUyggE9WQLa06sg,20575
+ sglang/srt/layers/quantization/__init__.py,sha256=VPYXShHvbvkOgVBlkIqic4RhdJ1y6EZ3r34T-nZMT1k,4606
  sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
- sglang/srt/layers/quantization/fp8.py,sha256=3oIUPaD0PBXQyTKr44I0YJ8XXDdwyoS_-ZA97XdSxXE,24143
- sglang/srt/layers/quantization/fp8_utils.py,sha256=eJDLLDu8ZbrbE3BfFIf89JlIMPOP-14DesbeVsajW0Q,1035
+ sglang/srt/layers/quantization/fp8.py,sha256=wNnpXLroIl7D98mlfCiXZPE9hrP5ricHrXY1WZBzEEo,30810
+ sglang/srt/layers/quantization/fp8_kernel.py,sha256=eoO1enzD9jPC80id2oC3i8bt-LN6-4Ey223yOQ9yIPE,8792
+ sglang/srt/layers/quantization/fp8_utils.py,sha256=HBJBaNcln1NrLxzw0ppUjMd6w-ryuGDDHCYJq7mRQac,4035
  sglang/srt/lora/lora.py,sha256=-o2mBmUvoVpdkgdAkWTARN4kfyep3UNEJLcg6moh0SU,15056
  sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
  sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rdss8,12886
@@ -96,44 +99,44 @@ sglang/srt/managers/data_parallel_controller.py,sha256=psI4FAuBGjtdnEuwagnGdtRqv
  sglang/srt/managers/detokenizer_manager.py,sha256=nZkbwt4yty_oy8rvg4T7PbgyVLoBLohvHl25xlQpBoo,8439
  sglang/srt/managers/image_processor.py,sha256=Y8RgyrzbJjJTpjbnZDa5qiiG5wWjZ68rOXUPDi6kkFo,13698
  sglang/srt/managers/io_struct.py,sha256=_LWWqT3LNwZGaWhg2d3kTg1V2MTHKzRasCvxF9Nfpi4,15429
- sglang/srt/managers/schedule_batch.py,sha256=SAd7sxhoC3Bp8_xd-TEcXEFZBlGZPbn8-wMvBcjU55Q,45607
- sglang/srt/managers/schedule_policy.py,sha256=cLNi__smbg02keWgUMfB_nEM3vllocPB0XyG1P5qO7I,15469
- sglang/srt/managers/scheduler.py,sha256=QlcVMtrLlNcBOkVISdO556jrK8a4LE4ULskC0oCH2IQ,61776
+ sglang/srt/managers/schedule_batch.py,sha256=qryPWCdOTFzxomDa80U-5guShOb1K4kBUWcPCCchYB8,45762
+ sglang/srt/managers/schedule_policy.py,sha256=QxjQ8-le062AMHHxool6CxkhvB4FIwhOQPzTX_JwL6U,15447
+ sglang/srt/managers/scheduler.py,sha256=Yh15uQFhJlku8a20-lhtIsiEHAcUmpL3BzL42kLVwiI,61637
  sglang/srt/managers/session_controller.py,sha256=Yp-IV3rXczACZxZXmF-QxW9CWICGy8KHQ9ttBGJ8WXA,2800
- sglang/srt/managers/tokenizer_manager.py,sha256=gnCCdB5XDobOoBKptwv-o0yYqDkMUxL78s0zBno5lM4,31219
+ sglang/srt/managers/tokenizer_manager.py,sha256=uKiTt__lCFXG60zQhmM_K7dU7IuedVSIQHVw3x3y5-E,31758
  sglang/srt/managers/tp_worker.py,sha256=X1EwFX3FSsmXx7jeeX2tjZRocaujabQYWm-M-0CFEBE,7363
  sglang/srt/managers/tp_worker_overlap_thread.py,sha256=-QNBJRKxraa9Xt2WI1AFzZYdneIJ1eXv0GjFzDqXoE0,8926
  sglang/srt/mem_cache/base_prefix_cache.py,sha256=QC8HS8RC5DXu14kyXsxAgEUsn0f932p2DjqzbKjc6Bs,962
  sglang/srt/mem_cache/chunk_cache.py,sha256=R2gHAuqKd5ayQW3NnsgoGUH31---Z5izCDyCqLL0FjQ,2524
  sglang/srt/mem_cache/flush_cache.py,sha256=GYcxmNXh4hsMpFfNOuCTpKilW7guZwTtAg_usVeM3J0,979
- sglang/srt/mem_cache/memory_pool.py,sha256=l9_srwXEfIIDF46nxykbHIOo1VSvU5_Ew3H0r5EC7Fo,11072
+ sglang/srt/mem_cache/memory_pool.py,sha256=oxk3UtiiFA3_1iIP6eFsk8HIcRI_8Z1-FE2KOWDr-YM,11366
  sglang/srt/mem_cache/radix_cache.py,sha256=c5voySV5L855c0G9cBEc9iQ4nR7PDDmg0V6fWWJHcq4,10945
  sglang/srt/metrics/collector.py,sha256=ZWoFx_FKN0sNMSZ8RJWUVQ0RFEYhIHxdw0d4TZTluMU,6861
  sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
- sglang/srt/model_executor/cuda_graph_runner.py,sha256=kZ3nV03MD8EQYQB38u4_88_wyW4unECxAdMVICpPyuk,16241
+ sglang/srt/model_executor/cuda_graph_runner.py,sha256=1n5WxoE9-0B3unwkkcR355K_D290h2LGt_7EvH02DQM,16246
  sglang/srt/model_executor/forward_batch_info.py,sha256=L5mVoW5SaO6To-7nGk0TZM-FFB5_78cARpJ-aC2rwD0,12883
  sglang/srt/model_executor/model_runner.py,sha256=MLYBcYIQihu2I3PBTUghiU2mSWsDMzlKzcnX7yHa9JU,29837
  sglang/srt/model_loader/__init__.py,sha256=zGZkOBz1zx-pkaIy47BasL3fjDlAcxAXUTjInOhXHAE,919
- sglang/srt/model_loader/loader.py,sha256=VBrY4W9CiVvS_D8yXhdkW9jReV9rSMSkJplabz0Fxgk,43528
+ sglang/srt/model_loader/loader.py,sha256=7OG_8-66vFDFZ9kVKGNK1BFBjZ6ql449dlyvdCbMqvE,43876
  sglang/srt/model_loader/utils.py,sha256=0NaMR67fESFopaklmsleiL27XH1QUrjZW246MUu1EJ0,1369
  sglang/srt/model_loader/weight_utils.py,sha256=kQo9KPThjH3HAOCfC_tdwdrshdWuWJOVpPR0skSyaRY,24193
  sglang/srt/models/baichuan.py,sha256=PzBOFcEAixakPEkQSaJwC0Xc1fu-yCsN9T0I67r8QmY,14919
  sglang/srt/models/chatglm.py,sha256=DOrEhmb0s-yPId88R6nJeLOTUEtogk-vkB69qT2JdWc,12913
  sglang/srt/models/commandr.py,sha256=PNXgfOZF84h-rSH0edEECUmEGW8YLb44V75Z_oDhFiA,14223
- sglang/srt/models/dbrx.py,sha256=2Wqcf3sv57l4gi2xH8yrb5WSmY-4_kbbf6fhpJ4aKWw,14581
- sglang/srt/models/deepseek.py,sha256=BVNICGoLjQoHmR5lc31YrZ6YbxSRTBilHqlLsALr2u8,15693
- sglang/srt/models/deepseek_v2.py,sha256=YKSrqagVcSUwCAi-rwIph-Xu12GrNETMNKxgnffWod8,35349
+ sglang/srt/models/dbrx.py,sha256=okIpIwdr8Cfrz_thzc1F75XqCUfHhFLvZ1B6BaswKoA,14585
+ sglang/srt/models/deepseek.py,sha256=_cVOvR6eSEgRf6TUBpTD5uMdijDWFw4sSt4lGzl8tbg,15697
+ sglang/srt/models/deepseek_v2.py,sha256=-v_OJr2c3gJ0NMxQjvT3Jknz1XPGkzKx0TVR3NIiC6A,37284
  sglang/srt/models/exaone.py,sha256=dkERTZVxrRroqu5AGLP7D4N6n8HvDqlNaDQUIe15mZY,13038
  sglang/srt/models/gemma.py,sha256=ydRqsG-7004r1fAiz01LHUmcj_6XN0Tn4xO1keJnMQk,12126
- sglang/srt/models/gemma2.py,sha256=41PlW8pMb4rMETdAni_JWDhZeIn_QsTQireAyUjsURA,15848
+ sglang/srt/models/gemma2.py,sha256=-bFN-Te3YWAunLCrF-XFk_6fJS7gHM4Ca6h6aesXUTM,16362
  sglang/srt/models/gemma2_reward.py,sha256=nJ01KfqLSJtqMLm3sG8p2mGZFK1xhhjh7I7Ccb-_Hq8,2494
  sglang/srt/models/gpt2.py,sha256=2je1kE09sGcaORWnJuGYAkcwwOrT9EK-KhQaoCKjCSA,9517
  sglang/srt/models/gpt_bigcode.py,sha256=tovyOdJu2x3LkzmkdFXX_iJdkxuyChIDxwgvPBy6UPo,9528
  sglang/srt/models/granite.py,sha256=AeQY9Dxd1ZnwgCYBK0vSXXiMGM-yt9iaOVf_ruOUHXw,20409
- sglang/srt/models/grok.py,sha256=UWvVEYfEoH0jGNFSbXpO66OGW5pzmIHlNKcn9gRZEoQ,15664
+ sglang/srt/models/grok.py,sha256=J9lgNbFebvXgF19nfZyHwlGPlGWY_m0LgP506YvOYrU,15668
  sglang/srt/models/internlm2.py,sha256=_xcKtd6YtEFUTozaN-yUb0xbSYckRpomfPSKcAk4j-Y,12127
  sglang/srt/models/internlm2_reward.py,sha256=8K26A9oIFFGx_9U2mF87j7FX8K87HGKMnVL3ht1Uc7I,2398
- sglang/srt/models/llama.py,sha256=S7nS05hhFGghXu0v-w9RZyBTY6OCEVF5Aaw4GX_E_9g,19929
+ sglang/srt/models/llama.py,sha256=o3FYyOhkZJirzugyYz1kxs6RpY84O_uKowWWmt3jv24,19929
  sglang/srt/models/llama_classification.py,sha256=DwboM1xHXdf3Fddf7xGnrfdOLJwXdiJs994cIpAPa2g,2984
  sglang/srt/models/llama_embedding.py,sha256=rh-AiczPY_pTpzcACHvSMVjh1hsV_MZBBwP0LQxPsGM,3130
  sglang/srt/models/llama_reward.py,sha256=oPxh5E2UkxLULNdR68dFvt2I7j33CJFN6nyA-8L2_cg,4516
@@ -142,27 +145,27 @@ sglang/srt/models/llavavid.py,sha256=dYUkKfHoE15vF_VXA_s_ICCTUMSmSgvP181fk8dUi0g
  sglang/srt/models/minicpm.py,sha256=ws4AqhOfAvYHGd04QuXCZel-Oxy9_vN4p4rTjs9RSz0,13723
  sglang/srt/models/minicpm3.py,sha256=YIKJDTpwjmpLlv1sNT93k2yZMvGQlI_H87czjf6QYyo,24707
  sglang/srt/models/mistral.py,sha256=EYifJUUzN2Z2-iL37eJiNZF_DB0H4pa0mKlgYRIxM70,838
- sglang/srt/models/mixtral.py,sha256=vi6ssY75kNLy_kJrDru6gJYiAogHjSniaO6aMFd1w4E,14515
+ sglang/srt/models/mixtral.py,sha256=L2Gz-Cmih1V75Ks9jmI2a6rUQ1Cl6F2uDgrhDjjDJzs,14523
  sglang/srt/models/mixtral_quant.py,sha256=uuVO1nWUZJiDhbqZN6gzSMwyfpyZorMuFXHeMCGo7N0,14022
  sglang/srt/models/mllama.py,sha256=3kX-UqeTSYZL5kPNdkfKEAEv3DpSAW1ArAAoeiXVzIc,37739
  sglang/srt/models/olmo.py,sha256=OCDMtX1OI83r80mzU4FMC3Tg8cleQ-7C8Tpoe8zgzss,11708
  sglang/srt/models/olmo2.py,sha256=aC7svioN7XT5owRxPrvhvWBNMON9QXGQBWJ1KHMyXeA,13442
- sglang/srt/models/olmoe.py,sha256=Rw-3YrHWd90MZQFnmcfUQ-3wAaI0PCFKb0DIrCDND3s,15347
+ sglang/srt/models/olmoe.py,sha256=LiHVGfRaC5c_BU_vVgtV9uLuDH_SC0dw1kEc61posmI,15351
  sglang/srt/models/phi3_small.py,sha256=44_my3QmgJ2N7SOkGZzEb62DXBeCVHojfmCWgkk2uCI,14802
  sglang/srt/models/qwen.py,sha256=_FKDbwaS5C07uJyyivZpBrXJVej4Ph9ivzJdzWJPxJ4,9904
  sglang/srt/models/qwen2.py,sha256=be4xgcuqNa9kBdaL7x3PjsnUky6fh5K33c_khAWSi04,12959
- sglang/srt/models/qwen2_moe.py,sha256=rYUk_vZW3ftKIIlqPvJZ1K-6oZ_PfGspixh1zm2Y8C8,16538
+ sglang/srt/models/qwen2_moe.py,sha256=6xRRJxWWh1M5UFPfvhsCpY477zv-30AeSRJXsvOkgFc,16542
  sglang/srt/models/qwen2_vl.py,sha256=3EaUlTbyWOTRXA7eViK1WqmVbCFhXLIpnos49zzf-yM,26561
  sglang/srt/models/registry.py,sha256=inKh9iwOp3LFYm3nqujg-OtABClOP-ifc1stA9cZegA,3434
  sglang/srt/models/stablelm.py,sha256=iBlIkM7CQmqI25nsujWk0LLCQD7TshzUU8qzZYYrt20,11311
  sglang/srt/models/torch_native_llama.py,sha256=YeXHorFm6QfnczLXwPb5TG9a-He0uiA9RzpR1YZKGg4,18758
  sglang/srt/models/xverse.py,sha256=Oq--KqvbYu2H4TMVGEHpSnJLEwXBpxlncR9ilsQeckc,13579
- sglang/srt/models/xverse_moe.py,sha256=AawKEQw--oAl-yzwCjoaZRG7q3rdkyDiam3FS0zjf_c,15537
+ sglang/srt/models/xverse_moe.py,sha256=7E60YIST4ELYwLRgjtHiLRI5Uyc7XqQTM7jQXiWaQs4,15541
  sglang/srt/models/yivl.py,sha256=88OubtuZ38Dxb2LzfV_MTPBI4wKhh4NJqFu--efbhFM,4809
- sglang/srt/openai_api/adapter.py,sha256=dvKq4O3Rhd77ad6iCtPNykgnk9PVJE-E8wHVsBAfCQQ,53927
- sglang/srt/openai_api/protocol.py,sha256=ecRNNqkhwwKZaIoJlPhtp2VTcHxBJDbNN8lrKS7uBx8,10406
+ sglang/srt/openai_api/adapter.py,sha256=X0HLuNhg-chDQjcdsQIRpZijlImEwZLHum3G0JgU4Go,54834
+ sglang/srt/openai_api/protocol.py,sha256=RMzeDfh2tZITjhNwB2nX68wZwQe40N6HBuVebCzEWiU,10468
  sglang/srt/sampling/sampling_batch_info.py,sha256=s--zNjk-LErZ5lMqnZ7KiuJltaziKRbQAU5qYpKIxAc,8564
- sglang/srt/sampling/sampling_params.py,sha256=n7RbBg_bS5fYhsiWa8uJYnfoXy_i5DvtTBOkuFnHDNU,5286
+ sglang/srt/sampling/sampling_params.py,sha256=BkgCJAOSmQXwJrNXg26zSjKfMy0d5mMN6oHRk_ZuESI,5499
  sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
  sglang/srt/sampling/penaltylib/orchestrator.py,sha256=J-DEemZcKm1--o37kf3qDOE8SZ_6H3d5oex49Mgq2ZU,10762
  sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=1Zp2aL6dD60mwD1tCcSG0x5IYo0v4z9ce-q_YwbJ9f8,2490
@@ -180,12 +183,13 @@ sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWU
  sglang/test/simple_eval_mgsm.py,sha256=rd7TSUyxdKbrXaVoewo24V8lCo_6kO8zxPhhmvylpw8,10259
  sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
  sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
+ sglang/test/test_block_fp8.py,sha256=rhrIun8aW5zq2qvuGRlo7F7aZ_upjVxtQMVlyc2Th_E,11771
  sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
  sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
  sglang/test/test_utils.py,sha256=HJG7kUQOk6n9FBbH89PDtQ41C3kt1cfJODhAEcFT0AQ,23823
  sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
- sglang-0.4.0.post2.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
- sglang-0.4.0.post2.dist-info/METADATA,sha256=maHXecD3U1DdhzfU2aBMhN96MQRqCBPsIA1KlO7t7dg,22512
- sglang-0.4.0.post2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- sglang-0.4.0.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
- sglang-0.4.0.post2.dist-info/RECORD,,
+ sglang-0.4.1.post1.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
+ sglang-0.4.1.post1.dist-info/METADATA,sha256=R2YDOrUU_49x5TEbNUODNlXvkSIzFqT7-hvInlSCs5k,22527
+ sglang-0.4.1.post1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ sglang-0.4.1.post1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.4.1.post1.dist-info/RECORD,,