sglang-0.4.7-py3-none-any.whl → sglang-0.4.8-py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those published versions.
Files changed (152)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_one_batch.py +8 -6
  4. sglang/bench_serving.py +1 -1
  5. sglang/lang/interpreter.py +40 -1
  6. sglang/lang/ir.py +27 -0
  7. sglang/math_utils.py +8 -0
  8. sglang/srt/_custom_ops.py +2 -2
  9. sglang/srt/code_completion_parser.py +2 -44
  10. sglang/srt/configs/model_config.py +6 -0
  11. sglang/srt/constants.py +3 -0
  12. sglang/srt/conversation.py +19 -3
  13. sglang/srt/custom_op.py +5 -1
  14. sglang/srt/disaggregation/base/__init__.py +1 -1
  15. sglang/srt/disaggregation/base/conn.py +25 -11
  16. sglang/srt/disaggregation/common/__init__.py +5 -1
  17. sglang/srt/disaggregation/common/utils.py +42 -0
  18. sglang/srt/disaggregation/decode.py +211 -72
  19. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
  20. sglang/srt/disaggregation/fake/__init__.py +1 -1
  21. sglang/srt/disaggregation/fake/conn.py +15 -9
  22. sglang/srt/disaggregation/mini_lb.py +34 -4
  23. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  24. sglang/srt/disaggregation/mooncake/conn.py +30 -29
  25. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  26. sglang/srt/disaggregation/nixl/conn.py +17 -12
  27. sglang/srt/disaggregation/prefill.py +144 -55
  28. sglang/srt/disaggregation/utils.py +155 -123
  29. sglang/srt/distributed/parallel_state.py +12 -4
  30. sglang/srt/entrypoints/engine.py +37 -29
  31. sglang/srt/entrypoints/http_server.py +153 -72
  32. sglang/srt/entrypoints/http_server_engine.py +0 -3
  33. sglang/srt/entrypoints/openai/__init__.py +0 -0
  34. sglang/srt/{openai_api → entrypoints/openai}/protocol.py +84 -10
  35. sglang/srt/entrypoints/openai/serving_base.py +149 -0
  36. sglang/srt/entrypoints/openai/serving_chat.py +921 -0
  37. sglang/srt/entrypoints/openai/serving_completions.py +424 -0
  38. sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
  39. sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
  40. sglang/srt/entrypoints/openai/serving_score.py +61 -0
  41. sglang/srt/entrypoints/openai/usage_processor.py +81 -0
  42. sglang/srt/entrypoints/openai/utils.py +72 -0
  43. sglang/srt/eplb_simulator/__init__.py +1 -0
  44. sglang/srt/eplb_simulator/reader.py +51 -0
  45. sglang/srt/function_call/base_format_detector.py +7 -4
  46. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  47. sglang/srt/function_call/ebnf_composer.py +64 -10
  48. sglang/srt/function_call/function_call_parser.py +6 -6
  49. sglang/srt/function_call/llama32_detector.py +1 -1
  50. sglang/srt/function_call/mistral_detector.py +1 -1
  51. sglang/srt/function_call/pythonic_detector.py +1 -1
  52. sglang/srt/function_call/qwen25_detector.py +1 -1
  53. sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
  54. sglang/srt/layers/activation.py +40 -3
  55. sglang/srt/layers/attention/aiter_backend.py +20 -4
  56. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  57. sglang/srt/layers/attention/cutlass_mla_backend.py +39 -15
  58. sglang/srt/layers/attention/flashattention_backend.py +71 -72
  59. sglang/srt/layers/attention/flashinfer_backend.py +10 -8
  60. sglang/srt/layers/attention/flashinfer_mla_backend.py +29 -28
  61. sglang/srt/layers/attention/flashmla_backend.py +7 -12
  62. sglang/srt/layers/attention/tbo_backend.py +3 -3
  63. sglang/srt/layers/attention/triton_backend.py +138 -130
  64. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  65. sglang/srt/layers/attention/vision.py +51 -24
  66. sglang/srt/layers/communicator.py +28 -10
  67. sglang/srt/layers/dp_attention.py +11 -2
  68. sglang/srt/layers/layernorm.py +29 -2
  69. sglang/srt/layers/linear.py +0 -4
  70. sglang/srt/layers/logits_processor.py +2 -14
  71. sglang/srt/layers/moe/ep_moe/kernels.py +165 -7
  72. sglang/srt/layers/moe/ep_moe/layer.py +249 -33
  73. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +7 -4
  76. sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
  77. sglang/srt/layers/moe/topk.py +107 -12
  78. sglang/srt/layers/pooler.py +56 -0
  79. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
  80. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  81. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
  82. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  83. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  84. sglang/srt/layers/quantization/fp8.py +25 -17
  85. sglang/srt/layers/quantization/fp8_kernel.py +44 -15
  86. sglang/srt/layers/quantization/fp8_utils.py +87 -22
  87. sglang/srt/layers/quantization/modelopt_quant.py +62 -8
  88. sglang/srt/layers/quantization/utils.py +5 -2
  89. sglang/srt/layers/radix_attention.py +2 -3
  90. sglang/srt/layers/rotary_embedding.py +42 -2
  91. sglang/srt/layers/sampler.py +1 -1
  92. sglang/srt/lora/lora_manager.py +249 -105
  93. sglang/srt/lora/mem_pool.py +53 -50
  94. sglang/srt/lora/utils.py +1 -1
  95. sglang/srt/managers/cache_controller.py +33 -14
  96. sglang/srt/managers/io_struct.py +31 -10
  97. sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
  98. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  99. sglang/srt/managers/schedule_batch.py +79 -37
  100. sglang/srt/managers/schedule_policy.py +70 -56
  101. sglang/srt/managers/scheduler.py +220 -79
  102. sglang/srt/managers/template_manager.py +226 -0
  103. sglang/srt/managers/tokenizer_manager.py +40 -10
  104. sglang/srt/managers/tp_worker.py +12 -2
  105. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  106. sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
  107. sglang/srt/mem_cache/base_prefix_cache.py +52 -8
  108. sglang/srt/mem_cache/chunk_cache.py +11 -15
  109. sglang/srt/mem_cache/hiradix_cache.py +38 -25
  110. sglang/srt/mem_cache/memory_pool.py +213 -505
  111. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  112. sglang/srt/mem_cache/radix_cache.py +56 -28
  113. sglang/srt/model_executor/cuda_graph_runner.py +198 -100
  114. sglang/srt/model_executor/forward_batch_info.py +32 -10
  115. sglang/srt/model_executor/model_runner.py +28 -12
  116. sglang/srt/model_loader/loader.py +16 -2
  117. sglang/srt/model_loader/weight_utils.py +11 -2
  118. sglang/srt/models/bert.py +113 -13
  119. sglang/srt/models/deepseek_nextn.py +29 -27
  120. sglang/srt/models/deepseek_v2.py +213 -173
  121. sglang/srt/models/glm4.py +312 -0
  122. sglang/srt/models/internvl.py +46 -102
  123. sglang/srt/models/mimo_mtp.py +2 -18
  124. sglang/srt/models/roberta.py +117 -9
  125. sglang/srt/models/vila.py +305 -0
  126. sglang/srt/reasoning_parser.py +21 -11
  127. sglang/srt/sampling/sampling_batch_info.py +24 -0
  128. sglang/srt/sampling/sampling_params.py +2 -0
  129. sglang/srt/server_args.py +351 -238
  130. sglang/srt/speculative/build_eagle_tree.py +1 -1
  131. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -9
  132. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +130 -14
  133. sglang/srt/speculative/eagle_utils.py +468 -116
  134. sglang/srt/speculative/eagle_worker.py +258 -84
  135. sglang/srt/torch_memory_saver_adapter.py +19 -15
  136. sglang/srt/two_batch_overlap.py +4 -2
  137. sglang/srt/utils.py +235 -11
  138. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  139. sglang/test/runners.py +38 -3
  140. sglang/test/test_block_fp8.py +1 -0
  141. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  142. sglang/test/test_block_fp8_ep.py +2 -0
  143. sglang/test/test_utils.py +4 -1
  144. sglang/utils.py +9 -0
  145. sglang/version.py +1 -1
  146. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/METADATA +8 -14
  147. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/RECORD +150 -128
  148. sglang/srt/entrypoints/verl_engine.py +0 -179
  149. sglang/srt/openai_api/adapter.py +0 -1990
  150. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
  151. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
  152. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
sglang/test/test_block_fp8_deep_gemm_blackwell.py ADDED
@@ -0,0 +1,252 @@
+ import itertools
+ import os
+ import unittest
+ from typing import List, Tuple
+
+ import torch
+ from deep_gemm import fp8_gemm_nt
+
+ from sglang.test.test_utils import CustomTestCase
+
+ _is_cuda = torch.cuda.is_available() and torch.version.cuda
+
+
+ # Modify form DeepGEMM Blackwell
+ def ceil_div(x: int, y: int) -> int:
+     return (x + y - 1) // y
+
+
+ def align(x: int, y: int) -> int:
+     return ceil_div(x, y) * y
+
+
+ def per_token_group_quant_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+     assert x.dim() == 2 and x.size(1) % 128 == 0
+     m, n = x.shape
+     x_view = x.view(m, -1, 128)
+     x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+     sf = x_amax / 448.0
+     return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf
+
+
+ def per_block_quant_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+     assert x.dim() == 2
+     m, n = x.shape
+     x_padded = torch.zeros(
+         (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+     )
+     x_padded[:m, :n] = x
+     x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+     x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+     sf = x_amax / 448.0
+     x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+     return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+         x_view.size(0), x_view.size(2)
+     )
+
+
+ def ceil_to_ue8m0(x: torch.Tensor):
+     assert x.view(-1).amax().item() > 0
+     return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
+
+
+ def per_token_group_quant_mxfp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+     assert x.dim() == 2 and x.size(1) % 128 == 0
+     m, n = x.shape
+     x_view = x.view(m, -1, 128)
+     x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+     sf = ceil_to_ue8m0(x_amax / 448.0)
+     return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf
+
+
+ def per_block_quant_mxfp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+     assert x.dim() == 2
+     m, n = x.shape
+     x_padded = torch.zeros(
+         (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+     )
+     x_padded[:m, :n] = x
+     x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+     x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+     sf = ceil_to_ue8m0(x_amax / 448.0)
+     x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+     return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+         x_view.size(0), x_view.size(2)
+     )
+
+
+ # For test
+ def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
+     """This function performs matrix multiplication with block-wise quantization using native torch.
+
+     It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+     The output is returned in the specified `output_dtype`.
+     """
+
+     A = A.to(torch.float32)
+     B = B.to(torch.float32)
+     assert A.shape[-1] == B.shape[-1]
+     assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+     assert len(block_size) == 2
+     block_n, block_k = block_size[0], block_size[1]
+     assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+     assert A.shape[:-1] == As.shape[:-1]
+
+     M = A.numel() // A.shape[-1]
+     N, K = B.shape
+     origin_C_shape = A.shape[:-1] + (N,)
+     A = A.reshape(M, A.shape[-1])
+     As = As.reshape(M, As.shape[-1])
+     n_tiles = (N + block_n - 1) // block_n
+     k_tiles = (K + block_k - 1) // block_k
+     assert n_tiles == Bs.shape[0]
+     assert k_tiles == Bs.shape[1]
+
+     C_shape = (M, N)
+     C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+     A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)]
+     B_tiles = [
+         [
+             B[
+                 j * block_n : min((j + 1) * block_n, N),
+                 i * block_k : min((i + 1) * block_k, K),
+             ]
+             for i in range(k_tiles)
+         ]
+         for j in range(n_tiles)
+     ]
+     C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)]
+     As_tiles = [As[:, i : i + 1] for i in range(k_tiles)]
+
+     for i in range(k_tiles):
+         for j in range(n_tiles):
+             a = A_tiles[i]
+             b = B_tiles[j][i]
+             c = C_tiles[j]
+             s = As_tiles[i] * Bs[j][i]
+             c[:, :] += torch.matmul(a, b.t()) * s
+
+     C = C.reshape(origin_C_shape).to(output_dtype)
+     return C
+
+
+ def block_quant_dequant(
+     x_q_block: torch.Tensor,
+     x_s: torch.Tensor,
+     block_size: List[int],
+     dtype: torch.dtype,
+ ) -> torch.Tensor:
+     """This function converts block-wise quantization to unquantized.
+     The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+     and the block size.
+     The output is an unquantized tensor with dtype.
+     """
+     block_n, block_k = block_size[0], block_size[1]
+     n, k = x_q_block.shape
+     n_tiles = (n + block_n - 1) // block_n
+     k_tiles = (k + block_k - 1) // block_k
+     assert n_tiles == x_s.shape[0]
+     assert k_tiles == x_s.shape[1]
+
+     x_dq_block = torch.empty_like(x_q_block, dtype=dtype)
+
+     for j in range(n_tiles):
+         for i in range(k_tiles):
+             x_q_block_tile = x_q_block[
+                 j * block_n : min((j + 1) * block_n, n),
+                 i * block_k : min((i + 1) * block_k, k),
+             ]
+             x_dq_block_tile = x_dq_block[
+                 j * block_n : min((j + 1) * block_n, n),
+                 i * block_k : min((i + 1) * block_k, k),
+             ]
+             x_dq_block_tile[:, :] = x_q_block_tile.to(torch.float32) * x_s[j][i]
+
+     return x_dq_block
+
+
+ class TestDeepGemmBlackwell(CustomTestCase):
+
+     if not _is_cuda:
+         OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
+         M = [1, 7, 83, 512, 2048]
+         NKs = [
+             (N, K)
+             for N in [128, 512, 1024, 4096, 7748, 13824]
+             for K in [256, 4096, 5120, 3884, 13824]
+         ]
+         # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+         BLOCK_SIZE = [[128, 128]]
+         SEEDS = [0]
+     else:
+         # use practical shape in DeepSeek V3 for test
+         OUT_DTYPES = [torch.bfloat16]
+         M = [64, 128, 512, 1024, 4096]
+         NKs = [
+             (2112, 7168),
+             (1536, 7168),
+             # (3072, 1536),
+             # (24576, 7168),
+             # (4096, 512),
+             # (7168, 2048),
+             # (4608, 7168),
+             # (512, 7168),
+             # (7168, 2304),
+             # (7168, 512),
+         ]
+         BLOCK_SIZE = [[128, 128]]
+         SEEDS = [0]
+
+     @classmethod
+     def setUpClass(cls):
+         if not torch.cuda.is_available():
+             raise unittest.SkipTest("CUDA is not available")
+         torch.set_default_device("cuda")
+
+     def _test_deep_gemm_blackwell(self, M, NK, block_size, out_dtype, seed):
+         N, K = NK
+         torch.manual_seed(seed)
+
+         A = torch.empty((M, K), dtype=torch.bfloat16).normal_(0, 0.2)
+         B = torch.empty((N, K), dtype=torch.bfloat16).normal_(0, 0.2)
+
+         A_q, A_s = per_token_group_quant_fp8(A)
+         B_q, B_s = per_block_quant_fp8(B)
+
+         A_dq = block_quant_dequant(A_q, A_s, [1, block_size[1]], out_dtype)
+         B_dq = block_quant_dequant(B_q, B_s, block_size, out_dtype)
+
+         A_qu = per_token_group_quant_mxfp8(A_dq)
+         B_qu = per_block_quant_mxfp8(B_dq)
+         out = None
+
+         with torch.inference_mode():
+             ref_out = native_w8a8_block_fp8_matmul(
+                 A_q, B_q, A_s, B_s, block_size, out_dtype
+             )
+             out = torch.empty_like(ref_out)
+             fp8_gemm_nt(A_qu, B_qu, out)
+
+         torch.testing.assert_close(out, ref_out, atol=1e-1, rtol=1e-2)
+
+     def test_deep_gemm_blackwell(self):
+         for params in itertools.product(
+             self.M,
+             self.NKs,
+             self.BLOCK_SIZE,
+             self.OUT_DTYPES,
+             self.SEEDS,
+         ):
+             with self.subTest(
+                 M=params[0],
+                 NKs=params[1],
+                 block_size=params[2],
+                 out_dtype=params[3],
+                 seed=params[4],
+             ):
+                 self._test_deep_gemm_blackwell(*params)
+
+
+ if __name__ == "__main__":
+     unittest.main(verbosity=2)
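
A side note on the new test above: DeepGEMM on Blackwell consumes UE8M0 scale factors, so each per-group or per-block scale is rounded up to the nearest power of two before the FP8 GEMM. A minimal sketch of that rounding, mirroring ceil_to_ue8m0 (illustration only; runs on CPU and assumes nothing beyond PyTorch):

    import torch

    raw_scale = torch.tensor([0.0123, 0.5000, 1.0000])
    # Round each scale up to the next power of two: 2 ** ceil(log2(x)).
    ue8m0_scale = torch.pow(2.0, torch.ceil(torch.log2(raw_scale)))
    print(ue8m0_scale)  # tensor([0.0156, 0.5000, 1.0000])

Rounding up (rather than to nearest) keeps the rescaled values inside the FP8 range, at the cost of a slightly coarser scale.
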
sglang/test/test_block_fp8_ep.py CHANGED
@@ -84,6 +84,7 @@ def ep_moe(
          top_k,
          hidden_states.shape[1],
          BLOCK_SIZE=512,
+         use_per_token_if_dynamic=True,
      )

      seg_indptr_cur_rank = seg_indptr[start_expert_id : end_expert_id + 2]
@@ -181,6 +182,7 @@ def ep_moe(
          end_expert_id,
          top_k,
          hidden_states.size(1),
+         0,
          BLOCK_SIZE=512,
      )
      return output
sglang/test/test_utils.py CHANGED
@@ -37,10 +37,13 @@ from sglang.utils import get_exception_traceback
  # General test models
  DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
  DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
+ DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
  DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
  DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"

  # MLA test models
+ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
+ DEFAULT_SMALL_CROSS_ENCODER_MODEL_NAME_FOR_TEST = "cross-encoder/ms-marco-MiniLM-L6-v2"
  DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
  DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
  DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
@@ -85,7 +88,7 @@ DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
  DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
  DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"

- DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
+ DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600


  def is_in_ci():
sglang/utils.py CHANGED
@@ -512,3 +512,12 @@ async def async_stream_and_merge(llm, prompt, sampling_params):
          cleaned_chunk = trim_overlap(final_text, chunk_text)
          final_text += cleaned_chunk
          yield cleaned_chunk  # yield the non-overlapping portion
+
+
+ def resolve_obj_by_qualname(qualname: str) -> Any:
+     """
+     Resolve an object by its fully qualified name.
+     """
+     module_name, obj_name = qualname.rsplit(".", 1)
+     module = importlib.import_module(module_name)
+     return getattr(module, obj_name)
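
For reference, a minimal usage sketch of the new resolve_obj_by_qualname helper added above (illustrative only; "os.path.join" is an arbitrary example of a fully qualified name):

    import os

    from sglang.utils import resolve_obj_by_qualname

    # The qualname is split into module ("os.path") and attribute ("join"),
    # the module is imported, and the attribute is looked up on it.
    join = resolve_obj_by_qualname("os.path.join")
    assert join("a", "b") == os.path.join("a", "b")
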
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.4.7"
+ __version__ = "0.4.8"
{sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sglang
- Version: 0.4.7
+ Version: 0.4.8
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
      Version 2.0, January 2004
@@ -230,6 +230,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
  Requires-Dist: msgspec; extra == "runtime-common"
  Requires-Dist: ninja; extra == "runtime-common"
  Requires-Dist: orjson; extra == "runtime-common"
+ Requires-Dist: outlines==0.1.11; extra == "runtime-common"
  Requires-Dist: packaging; extra == "runtime-common"
  Requires-Dist: partial_json_parser; extra == "runtime-common"
  Requires-Dist: pillow; extra == "runtime-common"
@@ -248,14 +249,13 @@ Requires-Dist: uvloop; extra == "runtime-common"
  Requires-Dist: xgrammar==0.1.19; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
- Requires-Dist: sgl-kernel==0.1.7; extra == "srt"
- Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
+ Requires-Dist: sgl-kernel==0.1.9; extra == "srt"
  Requires-Dist: torch==2.7.1; extra == "srt"
  Requires-Dist: torchaudio==2.7.1; extra == "srt"
  Requires-Dist: torchvision==0.22.1; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
- Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
  Requires-Dist: einops; extra == "srt"
+ Requires-Dist: flashinfer_python==0.2.6.post1; extra == "srt"
  Provides-Extra: blackwell
  Requires-Dist: sglang[runtime_common]; extra == "blackwell"
  Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -263,27 +263,21 @@ Requires-Dist: torch==2.7.1; extra == "blackwell"
  Requires-Dist: torchaudio==2.7.1; extra == "blackwell"
  Requires-Dist: torchvision==0.22.1; extra == "blackwell"
  Requires-Dist: cuda-python; extra == "blackwell"
- Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
  Requires-Dist: einops; extra == "blackwell"
  Requires-Dist: flashinfer_python==0.2.6.post1; extra == "blackwell"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
  Requires-Dist: vllm==0.6.7.dev2; extra == "srt-hip"
- Requires-Dist: outlines==0.1.11; extra == "srt-hip"
  Provides-Extra: srt-xpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
- Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-xpu"
  Provides-Extra: srt-hpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
- Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-hpu"
  Provides-Extra: srt-cpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
- Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-cpu"
  Requires-Dist: einops; extra == "srt-cpu"
  Provides-Extra: srt-npu
  Requires-Dist: sglang[runtime_common]; extra == "srt-npu"
- Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt-npu"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
@@ -292,7 +286,7 @@ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
  Provides-Extra: litellm
  Requires-Dist: litellm>=1.0.0; extra == "litellm"
  Provides-Extra: torch-memory-saver
- Requires-Dist: torch_memory_saver>=0.0.4; extra == "torch-memory-saver"
+ Requires-Dist: torch_memory_saver>=0.0.8; extra == "torch-memory-saver"
  Provides-Extra: decord
  Requires-Dist: decord; extra == "decord"
  Provides-Extra: test
@@ -371,7 +365,7 @@ Dynamic: license-file

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
+ | [**Blog**](https://lmsys.org/blog/2025-05-05-large-scale-ep/)
  | [**Documentation**](https://docs.sglang.ai/)
  | [**Join Slack**](https://slack.sglang.ai/)
  | [**Join Bi-Weekly Development Meeting**](https://meeting.sglang.ai/)
@@ -403,7 +397,7 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
  The core features include:

- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, prefill-decode disaggregation, speculative decoding, continuous batching, paged attention, tensor parallelism, pipeline parallelism, expert parallelism, structured outputs, chunked prefill, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
  - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, Qwen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -422,7 +416,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2025 H1)](https://github.com/sgl-project/sglang/issues/4042)

  ## Adoption and Sponsorship
- SGLang has been deployed at large scale, serving trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.
+ SGLang has been deployed at large scale, generating trillions of tokens in production every day. It is trusted and adopted by a broad range of leading enterprises and institutions, including xAI, NVIDIA, AMD, Google Cloud, Oracle Cloud, LinkedIn, Cursor, Voltage Park, Atlas Cloud, DataCrunch, Baseten, Nebius, Novita, InnoMatrix, RunPod, Stanford, UC Berkeley, UCLA, ETCHED, Jam & Tea Studios, Hyperbolic, as well as major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto standard in the industry, with production deployments running on over 100,000 GPUs worldwide.

  <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/refs/heads/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>