sglang 0.4.5__py3-none-any.whl → 0.4.5.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
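For readers who want to inspect changes beyond what is summarized here, a comparison like this can be reproduced locally. The sketch below is an informal example and not part of the package: it assumes both wheels have already been downloaded into the working directory (e.g. with `pip download --no-deps sglang==0.4.5` and `pip download --no-deps sglang==0.4.5.post2`) and diffs every UTF-8 text member of the two archives using only the Python standard library.

import difflib
import zipfile

# Assumed local paths; adjust to wherever the wheels were downloaded.
OLD_WHEEL = "sglang-0.4.5-py3-none-any.whl"
NEW_WHEEL = "sglang-0.4.5.post2-py3-none-any.whl"


def read_text_members(path):
    """Map each UTF-8-decodable archive member to its list of lines."""
    members = {}
    with zipfile.ZipFile(path) as zf:
        for name in zf.namelist():
            try:
                members[name] = zf.read(name).decode("utf-8").splitlines(keepends=True)
            except UnicodeDecodeError:
                continue  # skip binary members
    return members


old, new = read_text_members(OLD_WHEEL), read_text_members(NEW_WHEEL)
for name in sorted(set(old) | set(new)):
    for line in difflib.unified_diff(
        old.get(name, []),
        new.get(name, []),
        fromfile=f"0.4.5/{name}",
        tofile=f"0.4.5.post2/{name}",
    ):
        print(line, end="")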
Files changed (166)
  1. sglang/__init__.py +2 -4
  2. sglang/bench_one_batch.py +23 -2
  3. sglang/bench_serving.py +6 -4
  4. sglang/lang/backend/anthropic.py +0 -4
  5. sglang/lang/backend/base_backend.py +1 -1
  6. sglang/lang/backend/openai.py +1 -1
  7. sglang/lang/backend/vertexai.py +0 -1
  8. sglang/lang/compiler.py +1 -7
  9. sglang/lang/tracer.py +3 -7
  10. sglang/srt/_custom_ops.py +0 -2
  11. sglang/srt/configs/model_config.py +37 -5
  12. sglang/srt/constrained/base_grammar_backend.py +26 -5
  13. sglang/srt/constrained/llguidance_backend.py +1 -0
  14. sglang/srt/constrained/outlines_backend.py +1 -0
  15. sglang/srt/constrained/outlines_jump_forward.py +14 -1
  16. sglang/srt/constrained/reasoner_grammar_backend.py +101 -0
  17. sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
  18. sglang/srt/constrained/xgrammar_backend.py +27 -4
  19. sglang/srt/custom_op.py +0 -62
  20. sglang/srt/disaggregation/base/__init__.py +8 -0
  21. sglang/srt/disaggregation/base/conn.py +113 -0
  22. sglang/srt/disaggregation/decode.py +80 -11
  23. sglang/srt/disaggregation/mini_lb.py +58 -123
  24. sglang/srt/disaggregation/mooncake/__init__.py +6 -0
  25. sglang/srt/disaggregation/mooncake/conn.py +585 -0
  26. sglang/srt/disaggregation/mooncake/transfer_engine.py +77 -0
  27. sglang/srt/disaggregation/prefill.py +82 -22
  28. sglang/srt/disaggregation/utils.py +46 -0
  29. sglang/srt/entrypoints/EngineBase.py +53 -0
  30. sglang/srt/entrypoints/engine.py +36 -8
  31. sglang/srt/entrypoints/http_server.py +37 -8
  32. sglang/srt/entrypoints/http_server_engine.py +142 -0
  33. sglang/srt/entrypoints/verl_engine.py +42 -13
  34. sglang/srt/hf_transformers_utils.py +4 -0
  35. sglang/srt/layers/activation.py +6 -8
  36. sglang/srt/layers/attention/flashattention_backend.py +430 -257
  37. sglang/srt/layers/attention/flashinfer_backend.py +18 -9
  38. sglang/srt/layers/attention/torch_native_backend.py +6 -1
  39. sglang/srt/layers/attention/triton_backend.py +6 -0
  40. sglang/srt/layers/attention/triton_ops/extend_attention.py +13 -2
  41. sglang/srt/layers/attention/vision.py +1 -1
  42. sglang/srt/layers/dp_attention.py +2 -4
  43. sglang/srt/layers/elementwise.py +15 -2
  44. sglang/srt/layers/layernorm.py +1 -1
  45. sglang/srt/layers/linear.py +18 -3
  46. sglang/srt/layers/moe/ep_moe/layer.py +15 -29
  47. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +145 -118
  48. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/{E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +34 -34
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +46 -34
  56. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  57. sglang/srt/layers/moe/router.py +7 -1
  58. sglang/srt/layers/moe/topk.py +63 -45
  59. sglang/srt/layers/parameter.py +0 -2
  60. sglang/srt/layers/quantization/__init__.py +13 -5
  61. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  62. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +12 -2
  63. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -77
  64. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
  65. sglang/srt/layers/quantization/fp8.py +131 -136
  66. sglang/srt/layers/quantization/fp8_kernel.py +328 -46
  67. sglang/srt/layers/quantization/fp8_utils.py +206 -253
  68. sglang/srt/layers/quantization/kv_cache.py +43 -52
  69. sglang/srt/layers/quantization/modelopt_quant.py +271 -4
  70. sglang/srt/layers/quantization/moe_wna16.py +2 -0
  71. sglang/srt/layers/quantization/utils.py +5 -11
  72. sglang/srt/layers/quantization/w8a8_fp8.py +156 -4
  73. sglang/srt/layers/quantization/w8a8_int8.py +8 -7
  74. sglang/srt/layers/radix_attention.py +28 -1
  75. sglang/srt/layers/rotary_embedding.py +15 -3
  76. sglang/srt/layers/sampler.py +5 -10
  77. sglang/srt/lora/backend/base_backend.py +18 -2
  78. sglang/srt/lora/backend/flashinfer_backend.py +1 -1
  79. sglang/srt/lora/backend/triton_backend.py +1 -1
  80. sglang/srt/lora/layers.py +1 -1
  81. sglang/srt/lora/lora.py +1 -1
  82. sglang/srt/lora/lora_manager.py +1 -1
  83. sglang/srt/managers/detokenizer_manager.py +0 -1
  84. sglang/srt/managers/io_struct.py +255 -97
  85. sglang/srt/managers/mm_utils.py +7 -5
  86. sglang/srt/managers/multimodal_processor.py +0 -2
  87. sglang/srt/managers/multimodal_processors/base_processor.py +117 -79
  88. sglang/srt/managers/multimodal_processors/janus_pro.py +3 -1
  89. sglang/srt/managers/multimodal_processors/mllama4.py +21 -36
  90. sglang/srt/managers/schedule_batch.py +64 -25
  91. sglang/srt/managers/scheduler.py +80 -82
  92. sglang/srt/managers/tokenizer_manager.py +18 -3
  93. sglang/srt/managers/tp_worker.py +1 -0
  94. sglang/srt/mem_cache/hiradix_cache.py +5 -1
  95. sglang/srt/mem_cache/memory_pool.py +21 -3
  96. sglang/srt/metrics/collector.py +9 -0
  97. sglang/srt/model_executor/cuda_graph_runner.py +9 -6
  98. sglang/srt/model_executor/forward_batch_info.py +234 -15
  99. sglang/srt/model_executor/model_runner.py +67 -35
  100. sglang/srt/model_loader/loader.py +31 -4
  101. sglang/srt/model_loader/weight_utils.py +4 -2
  102. sglang/srt/models/baichuan.py +2 -0
  103. sglang/srt/models/bert.py +398 -0
  104. sglang/srt/models/chatglm.py +1 -0
  105. sglang/srt/models/commandr.py +1 -0
  106. sglang/srt/models/dbrx.py +1 -0
  107. sglang/srt/models/deepseek.py +2 -1
  108. sglang/srt/models/deepseek_nextn.py +74 -70
  109. sglang/srt/models/deepseek_v2.py +494 -366
  110. sglang/srt/models/exaone.py +1 -0
  111. sglang/srt/models/gemma.py +1 -0
  112. sglang/srt/models/gemma2.py +1 -0
  113. sglang/srt/models/gemma3_causal.py +1 -0
  114. sglang/srt/models/gpt2.py +1 -0
  115. sglang/srt/models/gpt_bigcode.py +1 -0
  116. sglang/srt/models/granite.py +1 -0
  117. sglang/srt/models/grok.py +1 -0
  118. sglang/srt/models/internlm2.py +1 -0
  119. sglang/srt/models/llama.py +6 -5
  120. sglang/srt/models/llama4.py +101 -34
  121. sglang/srt/models/minicpm.py +1 -0
  122. sglang/srt/models/minicpm3.py +30 -200
  123. sglang/srt/models/mixtral.py +1 -0
  124. sglang/srt/models/mixtral_quant.py +1 -0
  125. sglang/srt/models/mllama.py +51 -8
  126. sglang/srt/models/mllama4.py +102 -29
  127. sglang/srt/models/olmo.py +1 -0
  128. sglang/srt/models/olmo2.py +1 -0
  129. sglang/srt/models/olmoe.py +1 -0
  130. sglang/srt/models/phi3_small.py +1 -0
  131. sglang/srt/models/qwen.py +1 -0
  132. sglang/srt/models/qwen2.py +5 -1
  133. sglang/srt/models/qwen2_5_vl.py +35 -70
  134. sglang/srt/models/qwen2_moe.py +15 -13
  135. sglang/srt/models/qwen2_vl.py +27 -25
  136. sglang/srt/models/qwen3.py +335 -0
  137. sglang/srt/models/qwen3_moe.py +423 -0
  138. sglang/srt/models/stablelm.py +1 -0
  139. sglang/srt/models/xverse.py +1 -0
  140. sglang/srt/models/xverse_moe.py +1 -0
  141. sglang/srt/openai_api/adapter.py +4 -1
  142. sglang/srt/patch_torch.py +11 -0
  143. sglang/srt/reasoning_parser.py +0 -1
  144. sglang/srt/sampling/sampling_batch_info.py +2 -3
  145. sglang/srt/server_args.py +55 -19
  146. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +4 -4
  147. sglang/srt/speculative/eagle_utils.py +1 -11
  148. sglang/srt/speculative/eagle_worker.py +10 -9
  149. sglang/srt/utils.py +136 -10
  150. sglang/test/attention/test_flashattn_backend.py +259 -221
  151. sglang/test/attention/test_flashattn_mla_backend.py +285 -0
  152. sglang/test/attention/test_prefix_chunk_info.py +224 -0
  153. sglang/test/runners.py +5 -1
  154. sglang/test/test_block_fp8.py +224 -0
  155. sglang/test/test_custom_ops.py +1 -1
  156. sglang/test/test_utils.py +19 -8
  157. sglang/version.py +1 -1
  158. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/METADATA +15 -5
  159. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/RECORD +162 -147
  160. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/WHEEL +1 -1
  161. sglang/lang/__init__.py +0 -0
  162. sglang/srt/disaggregation/conn.py +0 -81
  163. sglang/srt/lora/backend/__init__.py +0 -25
  164. sglang/srt/server.py +0 -18
  165. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/licenses/LICENSE +0 -0
  166. {sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/top_level.txt +0 -0
sglang/test/test_block_fp8.py CHANGED
@@ -7,10 +7,13 @@ import torch
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
 from sglang.srt.layers.quantization.fp8_kernel import (
+    per_tensor_quant_mla_deep_gemm_masked_fp8,
+    per_tensor_quant_mla_fp8,
     per_token_group_quant_fp8,
     static_quant_fp8,
     w8a8_block_fp8_matmul,
 )
+from sglang.srt.layers.quantization.fp8_utils import input_to_float8
 from sglang.test.test_utils import CustomTestCase

 _is_cuda = torch.cuda.is_available() and torch.version.cuda
@@ -155,6 +158,117 @@ class TestStaticQuantFP8(CustomTestCase):
                 self._static_quant_fp8(*params)


+class TestPerTensorQuantMlaFP8(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    LAST_D_EXT = [1024, 0]
+    LAST_D = [512]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _per_tensor_quant_mla_fp8(self, num_tokens, d, last_d_ext, last_d, dtype, seed):
+        torch.manual_seed(seed)
+
+        x = torch.rand(
+            (num_tokens, d // last_d, last_d + last_d_ext),
+            dtype=dtype,
+        )
+        x_sub, _ = x.split([last_d, last_d_ext], dim=-1)
+
+        with torch.inference_mode():
+            ref_out, ref_s = input_to_float8(x_sub.transpose(0, 1))
+            out, out_s = per_tensor_quant_mla_fp8(x_sub.transpose(0, 1))
+
+        self.assertTrue(out.is_contiguous())
+        self.assertTrue(
+            torch.allclose(out.to(torch.float32), ref_out.to(torch.float32), rtol=0.50)
+        )
+        self.assertTrue(
+            torch.allclose(out_s.to(torch.float32), ref_s.to(torch.float32))
+        )
+
+    def test_per_tensor_quant_mla_fp8(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.LAST_D_EXT,
+            self.LAST_D,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                last_d_ext=params[2],
+                last_d=params[3],
+                dtype=params[4],
+                seed=params[5],
+            ):
+                self._per_tensor_quant_mla_fp8(*params)
+
+
+class TestPerTokenGroupQuantMlaDeepGemmMaskedFP8(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    B = [128]
+    NUM_TOKENS = [7, 83, 2048, 1024 * 16]
+    D = [512, 128]
+    GROUP_SIZE = [128]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _per_token_group_quant_mla_deep_gemm_masked_fp8(
+        self, b, num_tokens, d, dtype, group_size, seed
+    ):
+        torch.manual_seed(seed)
+
+        x = torch.rand(b, num_tokens, d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size, 1e-12)
+            out, scale, _, _, _ = per_tensor_quant_mla_deep_gemm_masked_fp8(
+                x, group_size
+            )
+            out = out[:, :num_tokens, :]
+            scale = scale[:, :num_tokens, :]
+
+        self.assertTrue(
+            torch.allclose(
+                out.to(torch.float32), ref_out.to(torch.float32), rtol=0.20, atol=1e-2
+            )
+        )
+        self.assertTrue(torch.allclose(scale, ref_scale))
+
+    def test_per_token_group_quant_mla_deep_gemm_masked_fp8(self):
+        for params in itertools.product(
+            self.B,
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.GROUP_SIZE,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                b=params[0],
+                num_tokens=params[1],
+                d=params[2],
+                dtype=params[3],
+                group_size=params[4],
+                seed=params[5],
+            ):
+                self._per_token_group_quant_mla_deep_gemm_masked_fp8(*params)
+
+
 # For test
 def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
     """This function performs matrix multiplication with block-wise quantization using native torch.
@@ -428,5 +542,115 @@ class TestW8A8BlockFP8FusedMoE(CustomTestCase):
                 self._w8a8_block_fp8_fused_moe(*params)


+# For test
+def torch_w8a8_block_fp8_bmm(a, a_s, w, w_s, block_shape, out_dtype):
+    """This function performs bmm with block-wise quantization using native torch."""
+
+    B, N, _ = w.shape
+    _, M, _ = a.shape
+    out = torch.empty((B, M, N), dtype=out_dtype, device=a.device)
+
+    for i in range(B):
+        out[i] = native_w8a8_block_fp8_matmul(
+            a[i], w[i], a_s[i], w_s[i], block_shape, output_dtype=out_dtype
+        )
+
+    return out
+
+
+class TestW8A8BlockFP8BatchedDeepGemm(CustomTestCase):
+    DTYPES = [torch.bfloat16]
+    M = [1, 33, 64, 222, 8192]
+    N = [128, 512]
+    K = [128, 512]
+    BATCH = [128]
+    BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        try:
+            import deep_gemm
+        except ImportError:
+            raise unittest.SkipTest("DeepGEMM is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_fp8_batched_deep_gemm(self, M, N, K, B, block_size, dtype, seed):
+        torch.manual_seed(seed)
+        factor_for_scale = 1e-2
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        a_fp32 = torch.randn((B, M, K), dtype=torch.float32) / 10
+        a = a_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w_fp32 = (torch.rand((B, N, K), dtype=torch.float32) - 0.5) * 2 * fp8_max
+        w = w_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles_w = (N + block_n - 1) // block_n
+        k_tiles_w = (K + block_k - 1) // block_k
+
+        w_s = (
+            torch.rand((B, n_tiles_w, k_tiles_w), dtype=torch.float32)
+            * factor_for_scale
+        )
+        a_s = torch.rand((B, M, k_tiles_w), dtype=torch.float32) * factor_for_scale
+
+        ae = a.new_empty(B, (M + 255) // 256 * 256, K)
+        ae_s = a_s.new_empty(B, (M + 255) // 256 * 256, k_tiles_w)
+        oe = torch.empty((B, (M + 255) // 256 * 256, N), dtype=dtype)
+        ae[:, :M, :] = a
+        ae_s[:, :M, :] = a_s
+
+        masked_m = torch.full((B,), M, dtype=torch.int)
+        expected_m = M
+        lhs = (
+            ae,
+            ae_s,
+        )
+        rhs = (
+            w,
+            w_s,
+        )
+
+        from deep_gemm import m_grouped_gemm_fp8_fp8_bf16_nt_masked
+
+        with torch.inference_mode():
+            ref_out = torch_w8a8_block_fp8_bmm(a, a_s, w, w_s, block_size, dtype)
+            m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs, rhs, oe, masked_m, expected_m)
+            out = oe[:, :M, :]
+
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.0001
+        )
+
+    def test_w8a8_block_fp8_batched_deep_gemm(self):
+
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.BATCH,
+            self.BLOCK_SIZE,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                B=params[3],
+                block_size=params[4],
+                dtype=params[5],
+                seed=params[6],
+            ):
+                self._w8a8_block_fp8_batched_deep_gemm(*params)
+
+
 if __name__ == "__main__":
     unittest.main(verbosity=2)
sglang/test/test_custom_ops.py CHANGED
@@ -3,7 +3,7 @@
 import pytest
 import torch

-from sglang.srt.custom_op import scaled_fp8_quant
+from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
 from sglang.srt.utils import is_cuda


sglang/test/test_utils.py CHANGED
@@ -25,7 +25,12 @@ from sglang.bench_serving import run_benchmark
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.srt.utils import get_bool_env_var, kill_process_tree, retry
+from sglang.srt.utils import (
+    get_bool_env_var,
+    is_port_available,
+    kill_process_tree,
+    retry,
+)
 from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback

@@ -37,11 +42,6 @@ DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
 DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST = (
     "nvidia/Llama-3.1-8B-Instruct-FP8"
 )
-# TODO(yundai424): right now specifying to an older revision since the latest one
-# carries kv cache quantization which doesn't work yet
-DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_REVISION = (
-    "13858565416dbdc0b4e7a4a677fadfbd5b9e5bb9"
-)

 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
@@ -103,6 +103,17 @@ def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None)
     return pred


+def find_available_port(base_port: int):
+    port = base_port + random.randint(100, 1000)
+    while True:
+        if is_port_available(port):
+            return port
+        if port < 60000:
+            port += 42
+        else:
+            port -= 43
+
+
 def call_generate_vllm(prompt, temperature, max_tokens, stop=None, n=1, url=None):
     assert url is not None

@@ -674,8 +685,6 @@ def run_bench_one_batch(model, other_args):
         "python3",
         "-m",
         "sglang.bench_one_batch",
-        "--model-path",
-        model,
         "--batch-size",
         "1",
         "--input",
@@ -684,6 +693,8 @@ def run_bench_one_batch(model, other_args):
         "8",
         *[str(x) for x in other_args],
     ]
+    if model is not None:
+        command += ["--model-path", model]
     process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

     try:
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.5"
+__version__ = "0.4.5.post2"
{sglang-0.4.5.dist-info → sglang-0.4.5.post2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.5
+Version: 0.4.5.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -239,20 +239,30 @@ Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
-Requires-Dist: transformers==4.51.0; extra == "runtime-common"
+Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.8; extra == "srt"
+Requires-Dist: sgl-kernel==0.0.9.post2; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
+Requires-Dist: torchvision==0.20.1; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Requires-Dist: partial_json_parser; extra == "srt"
 Requires-Dist: einops; extra == "srt"
+Provides-Extra: blackwell
+Requires-Dist: sglang[runtime_common]; extra == "blackwell"
+Requires-Dist: sgl-kernel; extra == "blackwell"
+Requires-Dist: torch; extra == "blackwell"
+Requires-Dist: torchvision; extra == "blackwell"
+Requires-Dist: cuda-python; extra == "blackwell"
+Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "blackwell"
+Requires-Dist: partial_json_parser; extra == "blackwell"
+Requires-Dist: einops; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -371,7 +381,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:

-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
@@ -391,7 +401,7 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s

 ## Adoption and Sponsorship
 The project has been deployed to large-scale production, generating trillions of tokens every day.
-It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.
+It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Iflytek, Jam & Tea Studios, LinkedIn, LMSYS, Meituan, Nebius, Novita AI, NVIDIA, Oracle, RunPod, Stanford, UC Berkeley, UCLA, xAI, and 01.AI.

 <img src="https://raw.githubusercontent.com/sgl-project/sgl-learning-materials/main/slides/adoption.png" alt="logo" width="800" margin="10px"></img>