sglang 0.4.5.post1__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their public registry, and is provided for informational purposes only.
Files changed (119)
  1. sglang/__init__.py +2 -4
  2. sglang/bench_one_batch.py +2 -2
  3. sglang/bench_serving.py +3 -6
  4. sglang/compile_deep_gemm.py +136 -0
  5. sglang/lang/backend/anthropic.py +0 -4
  6. sglang/lang/backend/base_backend.py +1 -1
  7. sglang/lang/backend/openai.py +6 -2
  8. sglang/lang/backend/runtime_endpoint.py +5 -1
  9. sglang/lang/backend/vertexai.py +0 -1
  10. sglang/lang/compiler.py +1 -7
  11. sglang/lang/tracer.py +3 -7
  12. sglang/srt/_custom_ops.py +0 -2
  13. sglang/srt/configs/model_config.py +4 -1
  14. sglang/srt/constrained/outlines_jump_forward.py +14 -1
  15. sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
  16. sglang/srt/constrained/xgrammar_backend.py +27 -4
  17. sglang/srt/custom_op.py +0 -62
  18. sglang/srt/disaggregation/decode.py +105 -6
  19. sglang/srt/disaggregation/mini_lb.py +74 -9
  20. sglang/srt/disaggregation/mooncake/conn.py +33 -63
  21. sglang/srt/disaggregation/mooncake/transfer_engine.py +30 -61
  22. sglang/srt/disaggregation/nixl/__init__.py +1 -0
  23. sglang/srt/disaggregation/nixl/conn.py +622 -0
  24. sglang/srt/disaggregation/prefill.py +137 -17
  25. sglang/srt/disaggregation/utils.py +32 -0
  26. sglang/srt/entrypoints/engine.py +4 -0
  27. sglang/srt/entrypoints/http_server.py +3 -7
  28. sglang/srt/entrypoints/verl_engine.py +7 -5
  29. sglang/srt/function_call_parser.py +60 -0
  30. sglang/srt/layers/activation.py +6 -8
  31. sglang/srt/layers/attention/flashattention_backend.py +883 -209
  32. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  33. sglang/srt/layers/attention/torch_native_backend.py +6 -1
  34. sglang/srt/layers/attention/triton_backend.py +6 -0
  35. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
  36. sglang/srt/layers/attention/triton_ops/extend_attention.py +18 -7
  37. sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
  38. sglang/srt/layers/dp_attention.py +1 -1
  39. sglang/srt/layers/layernorm.py +20 -5
  40. sglang/srt/layers/linear.py +17 -3
  41. sglang/srt/layers/moe/ep_moe/layer.py +17 -29
  42. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  43. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +14 -19
  44. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  45. sglang/srt/layers/moe/topk.py +27 -30
  46. sglang/srt/layers/parameter.py +0 -2
  47. sglang/srt/layers/quantization/__init__.py +1 -0
  48. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  49. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +9 -2
  50. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +16 -44
  51. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  52. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
  53. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
  54. sglang/srt/layers/quantization/deep_gemm.py +378 -0
  55. sglang/srt/layers/quantization/fp8.py +115 -132
  56. sglang/srt/layers/quantization/fp8_kernel.py +213 -88
  57. sglang/srt/layers/quantization/fp8_utils.py +189 -264
  58. sglang/srt/layers/quantization/gptq.py +13 -7
  59. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  60. sglang/srt/layers/quantization/moe_wna16.py +2 -0
  61. sglang/srt/layers/quantization/utils.py +5 -11
  62. sglang/srt/layers/quantization/w8a8_fp8.py +2 -0
  63. sglang/srt/layers/quantization/w8a8_int8.py +7 -7
  64. sglang/srt/layers/radix_attention.py +15 -0
  65. sglang/srt/layers/rotary_embedding.py +9 -8
  66. sglang/srt/layers/sampler.py +7 -12
  67. sglang/srt/lora/backend/base_backend.py +18 -2
  68. sglang/srt/lora/backend/flashinfer_backend.py +1 -1
  69. sglang/srt/lora/backend/triton_backend.py +1 -1
  70. sglang/srt/lora/layers.py +1 -1
  71. sglang/srt/lora/lora.py +1 -1
  72. sglang/srt/lora/lora_manager.py +1 -1
  73. sglang/srt/managers/data_parallel_controller.py +7 -1
  74. sglang/srt/managers/detokenizer_manager.py +0 -1
  75. sglang/srt/managers/io_struct.py +15 -3
  76. sglang/srt/managers/mm_utils.py +4 -3
  77. sglang/srt/managers/multimodal_processor.py +0 -2
  78. sglang/srt/managers/multimodal_processors/base_processor.py +3 -2
  79. sglang/srt/managers/schedule_batch.py +15 -4
  80. sglang/srt/managers/scheduler.py +28 -77
  81. sglang/srt/managers/tokenizer_manager.py +116 -29
  82. sglang/srt/managers/tp_worker.py +1 -0
  83. sglang/srt/mem_cache/hiradix_cache.py +41 -29
  84. sglang/srt/mem_cache/memory_pool.py +38 -15
  85. sglang/srt/model_executor/cuda_graph_runner.py +15 -10
  86. sglang/srt/model_executor/model_runner.py +39 -31
  87. sglang/srt/models/bert.py +398 -0
  88. sglang/srt/models/deepseek.py +1 -1
  89. sglang/srt/models/deepseek_nextn.py +74 -70
  90. sglang/srt/models/deepseek_v2.py +292 -348
  91. sglang/srt/models/llama.py +5 -5
  92. sglang/srt/models/minicpm3.py +31 -203
  93. sglang/srt/models/minicpmo.py +17 -6
  94. sglang/srt/models/qwen2.py +4 -1
  95. sglang/srt/models/qwen2_moe.py +14 -13
  96. sglang/srt/models/qwen3.py +335 -0
  97. sglang/srt/models/qwen3_moe.py +423 -0
  98. sglang/srt/openai_api/adapter.py +71 -4
  99. sglang/srt/openai_api/protocol.py +6 -1
  100. sglang/srt/reasoning_parser.py +0 -1
  101. sglang/srt/sampling/sampling_batch_info.py +2 -3
  102. sglang/srt/server_args.py +86 -72
  103. sglang/srt/speculative/build_eagle_tree.py +2 -2
  104. sglang/srt/speculative/eagle_utils.py +2 -2
  105. sglang/srt/speculative/eagle_worker.py +6 -14
  106. sglang/srt/utils.py +62 -6
  107. sglang/test/runners.py +5 -1
  108. sglang/test/test_block_fp8.py +167 -0
  109. sglang/test/test_custom_ops.py +1 -1
  110. sglang/test/test_utils.py +3 -1
  111. sglang/version.py +1 -1
  112. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +5 -5
  113. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +116 -110
  114. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +1 -1
  115. sglang/lang/__init__.py +0 -0
  116. sglang/srt/lora/backend/__init__.py +0 -25
  117. sglang/srt/server.py +0 -18
  118. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
  119. {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
sglang/test/test_block_fp8.py CHANGED
@@ -7,6 +7,7 @@ import torch
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
 from sglang.srt.layers.quantization.fp8_kernel import (
+    per_tensor_quant_mla_deep_gemm_masked_fp8,
     per_tensor_quant_mla_fp8,
     per_token_group_quant_fp8,
     static_quant_fp8,
@@ -212,6 +213,62 @@ class TestPerTensorQuantMlaFP8(CustomTestCase):
                 self._per_tensor_quant_mla_fp8(*params)
 
 
+class TestPerTokenGroupQuantMlaDeepGemmMaskedFP8(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    B = [128]
+    NUM_TOKENS = [7, 83, 2048, 1024 * 16]
+    D = [512, 128]
+    GROUP_SIZE = [128]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _per_token_group_quant_mla_deep_gemm_masked_fp8(
+        self, b, num_tokens, d, dtype, group_size, seed
+    ):
+        torch.manual_seed(seed)
+
+        x = torch.rand(b, num_tokens, d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size, 1e-12)
+            out, scale, _, _, _ = per_tensor_quant_mla_deep_gemm_masked_fp8(
+                x, group_size
+            )
+            out = out[:, :num_tokens, :]
+            scale = scale[:, :num_tokens, :]
+
+        self.assertTrue(
+            torch.allclose(
+                out.to(torch.float32), ref_out.to(torch.float32), rtol=0.20, atol=1e-2
+            )
+        )
+        self.assertTrue(torch.allclose(scale, ref_scale))
+
+    def test_per_token_group_quant_mla_deep_gemm_masked_fp8(self):
+        for params in itertools.product(
+            self.B,
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.GROUP_SIZE,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                b=params[0],
+                num_tokens=params[1],
+                d=params[2],
+                dtype=params[3],
+                group_size=params[4],
+                seed=params[5],
+            ):
+                self._per_token_group_quant_mla_deep_gemm_masked_fp8(*params)
+
+
 # For test
 def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
     """This function performs matrix multiplication with block-wise quantization using native torch.
@@ -485,5 +542,115 @@ class TestW8A8BlockFP8FusedMoE(CustomTestCase):
                 self._w8a8_block_fp8_fused_moe(*params)
 
 
+# For test
+def torch_w8a8_block_fp8_bmm(a, a_s, w, w_s, block_shape, out_dtype):
+    """This function performs bmm with block-wise quantization using native torch."""
+
+    B, N, _ = w.shape
+    _, M, _ = a.shape
+    out = torch.empty((B, M, N), dtype=out_dtype, device=a.device)
+
+    for i in range(B):
+        out[i] = native_w8a8_block_fp8_matmul(
+            a[i], w[i], a_s[i], w_s[i], block_shape, output_dtype=out_dtype
+        )
+
+    return out
+
+
+class TestW8A8BlockFP8BatchedDeepGemm(CustomTestCase):
+    DTYPES = [torch.bfloat16]
+    M = [1, 33, 64, 222, 8192]
+    N = [128, 512]
+    K = [128, 512]
+    BATCH = [128]
+    BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        try:
+            import deep_gemm
+        except ImportError:
+            raise unittest.SkipTest("DeepGEMM is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_fp8_batched_deep_gemm(self, M, N, K, B, block_size, dtype, seed):
+        torch.manual_seed(seed)
+        factor_for_scale = 1e-2
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        a_fp32 = torch.randn((B, M, K), dtype=torch.float32) / 10
+        a = a_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w_fp32 = (torch.rand((B, N, K), dtype=torch.float32) - 0.5) * 2 * fp8_max
+        w = w_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles_w = (N + block_n - 1) // block_n
+        k_tiles_w = (K + block_k - 1) // block_k
+
+        w_s = (
+            torch.rand((B, n_tiles_w, k_tiles_w), dtype=torch.float32)
+            * factor_for_scale
+        )
+        a_s = torch.rand((B, M, k_tiles_w), dtype=torch.float32) * factor_for_scale
+
+        ae = a.new_empty(B, (M + 255) // 256 * 256, K)
+        ae_s = a_s.new_empty(B, (M + 255) // 256 * 256, k_tiles_w)
+        oe = torch.empty((B, (M + 255) // 256 * 256, N), dtype=dtype)
+        ae[:, :M, :] = a
+        ae_s[:, :M, :] = a_s
+
+        masked_m = torch.full((B,), M, dtype=torch.int)
+        expected_m = M
+        lhs = (
+            ae,
+            ae_s,
+        )
+        rhs = (
+            w,
+            w_s,
+        )
+
+        from deep_gemm import m_grouped_gemm_fp8_fp8_bf16_nt_masked
+
+        with torch.inference_mode():
+            ref_out = torch_w8a8_block_fp8_bmm(a, a_s, w, w_s, block_size, dtype)
+            m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs, rhs, oe, masked_m, expected_m)
+            out = oe[:, :M, :]
+
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.0001
+        )
+
+    def test_w8a8_block_fp8_batched_deep_gemm(self):
+
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.BATCH,
+            self.BLOCK_SIZE,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                B=params[3],
+                block_size=params[4],
+                dtype=params[5],
+                seed=params[6],
+            ):
+                self._w8a8_block_fp8_batched_deep_gemm(*params)
+
+
 if __name__ == "__main__":
     unittest.main(verbosity=2)
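One detail worth flagging in `TestW8A8BlockFP8BatchedDeepGemm`: the tensors handed to the masked grouped GEMM are padded along the token dimension to a multiple of 256, and `masked_m` tells the kernel how many rows per batch entry are real. The arithmetic, in isolation:

    # Padding arithmetic from the test above. The 256 alignment appears to
    # be what the masked DeepGEMM path expects; that is an inference from
    # the test, not a documented contract.
    M = 33
    M_padded = (M + 255) // 256 * 256   # ceil(M / 256) * 256 -> 256
    # ae, ae_s, oe are allocated with M_padded rows; only rows [:M] are
    # written, and masked_m = torch.full((B,), M) marks the tail as padding.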
sglang/test/test_custom_ops.py CHANGED
@@ -3,7 +3,7 @@
 import pytest
 import torch
 
-from sglang.srt.custom_op import scaled_fp8_quant
+from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
 from sglang.srt.utils import is_cuda
 
 
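The only change here is the import path: `scaled_fp8_quant` moved from `sglang.srt.custom_op` to `sglang.srt.layers.quantization.fp8_kernel`. Assuming the helper keeps the common convention for this function (optional precomputed scale; returns the FP8 tensor together with the scale actually used), callers only need to update the import:

    # Hedged sketch: only the import path is taken from the diff; the call
    # pattern is an assumption based on the usual scaled_fp8_quant API.
    import torch
    from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant

    x = torch.randn(16, 256, dtype=torch.float16, device="cuda")
    x_q, scale = scaled_fp8_quant(x)        # dynamic: scale computed on the fly
    x_q2, _ = scaled_fp8_quant(x, scale)    # static: reuse a known scale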
sglang/test/test_utils.py CHANGED
@@ -450,7 +450,9 @@ def popen_launch_server(
 
         return_code = process.poll()
         if return_code is not None:
-            raise Exception(f"Server unexpectedly exits ({return_code=}).")
+            raise Exception(
+                f"Server unexpectedly exits ({return_code=}). Usually there will be error logs describing the cause far above this line."
+            )
 
         time.sleep(10)
 
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.5.post1"
+__version__ = "0.4.5.post3"
{sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.5.post1
+Version: 0.4.5.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -246,10 +246,10 @@ Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.9.post1; extra == "srt"
+Requires-Dist: sgl-kernel==0.0.9.post2; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
-Requires-Dist: torch==2.5.1; extra == "srt"
-Requires-Dist: torchvision==0.20.1; extra == "srt"
+Requires-Dist: torch==2.6.0; extra == "srt"
+Requires-Dist: torchvision==0.21.0; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Requires-Dist: partial_json_parser; extra == "srt"
@@ -381,7 +381,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, and quantization (FP8/INT4/AWQ/GPTQ).
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.