sglang 0.4.5.post1__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -4
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +3 -6
- sglang/compile_deep_gemm.py +136 -0
- sglang/lang/backend/anthropic.py +0 -4
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/openai.py +6 -2
- sglang/lang/backend/runtime_endpoint.py +5 -1
- sglang/lang/backend/vertexai.py +0 -1
- sglang/lang/compiler.py +1 -7
- sglang/lang/tracer.py +3 -7
- sglang/srt/_custom_ops.py +0 -2
- sglang/srt/configs/model_config.py +4 -1
- sglang/srt/constrained/outlines_jump_forward.py +14 -1
- sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
- sglang/srt/constrained/xgrammar_backend.py +27 -4
- sglang/srt/custom_op.py +0 -62
- sglang/srt/disaggregation/decode.py +105 -6
- sglang/srt/disaggregation/mini_lb.py +74 -9
- sglang/srt/disaggregation/mooncake/conn.py +33 -63
- sglang/srt/disaggregation/mooncake/transfer_engine.py +30 -61
- sglang/srt/disaggregation/nixl/__init__.py +1 -0
- sglang/srt/disaggregation/nixl/conn.py +622 -0
- sglang/srt/disaggregation/prefill.py +137 -17
- sglang/srt/disaggregation/utils.py +32 -0
- sglang/srt/entrypoints/engine.py +4 -0
- sglang/srt/entrypoints/http_server.py +3 -7
- sglang/srt/entrypoints/verl_engine.py +7 -5
- sglang/srt/function_call_parser.py +60 -0
- sglang/srt/layers/activation.py +6 -8
- sglang/srt/layers/attention/flashattention_backend.py +883 -209
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/torch_native_backend.py +6 -1
- sglang/srt/layers/attention/triton_backend.py +6 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +18 -7
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
- sglang/srt/layers/dp_attention.py +1 -1
- sglang/srt/layers/layernorm.py +20 -5
- sglang/srt/layers/linear.py +17 -3
- sglang/srt/layers/moe/ep_moe/layer.py +17 -29
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +14 -19
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- sglang/srt/layers/moe/topk.py +27 -30
- sglang/srt/layers/parameter.py +0 -2
- sglang/srt/layers/quantization/__init__.py +1 -0
- sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +9 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +16 -44
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
- sglang/srt/layers/quantization/deep_gemm.py +378 -0
- sglang/srt/layers/quantization/fp8.py +115 -132
- sglang/srt/layers/quantization/fp8_kernel.py +213 -88
- sglang/srt/layers/quantization/fp8_utils.py +189 -264
- sglang/srt/layers/quantization/gptq.py +13 -7
- sglang/srt/layers/quantization/modelopt_quant.py +2 -2
- sglang/srt/layers/quantization/moe_wna16.py +2 -0
- sglang/srt/layers/quantization/utils.py +5 -11
- sglang/srt/layers/quantization/w8a8_fp8.py +2 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -7
- sglang/srt/layers/radix_attention.py +15 -0
- sglang/srt/layers/rotary_embedding.py +9 -8
- sglang/srt/layers/sampler.py +7 -12
- sglang/srt/lora/backend/base_backend.py +18 -2
- sglang/srt/lora/backend/flashinfer_backend.py +1 -1
- sglang/srt/lora/backend/triton_backend.py +1 -1
- sglang/srt/lora/layers.py +1 -1
- sglang/srt/lora/lora.py +1 -1
- sglang/srt/lora/lora_manager.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +7 -1
- sglang/srt/managers/detokenizer_manager.py +0 -1
- sglang/srt/managers/io_struct.py +15 -3
- sglang/srt/managers/mm_utils.py +4 -3
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +3 -2
- sglang/srt/managers/schedule_batch.py +15 -4
- sglang/srt/managers/scheduler.py +28 -77
- sglang/srt/managers/tokenizer_manager.py +116 -29
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +41 -29
- sglang/srt/mem_cache/memory_pool.py +38 -15
- sglang/srt/model_executor/cuda_graph_runner.py +15 -10
- sglang/srt/model_executor/model_runner.py +39 -31
- sglang/srt/models/bert.py +398 -0
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_nextn.py +74 -70
- sglang/srt/models/deepseek_v2.py +292 -348
- sglang/srt/models/llama.py +5 -5
- sglang/srt/models/minicpm3.py +31 -203
- sglang/srt/models/minicpmo.py +17 -6
- sglang/srt/models/qwen2.py +4 -1
- sglang/srt/models/qwen2_moe.py +14 -13
- sglang/srt/models/qwen3.py +335 -0
- sglang/srt/models/qwen3_moe.py +423 -0
- sglang/srt/openai_api/adapter.py +71 -4
- sglang/srt/openai_api/protocol.py +6 -1
- sglang/srt/reasoning_parser.py +0 -1
- sglang/srt/sampling/sampling_batch_info.py +2 -3
- sglang/srt/server_args.py +86 -72
- sglang/srt/speculative/build_eagle_tree.py +2 -2
- sglang/srt/speculative/eagle_utils.py +2 -2
- sglang/srt/speculative/eagle_worker.py +6 -14
- sglang/srt/utils.py +62 -6
- sglang/test/runners.py +5 -1
- sglang/test/test_block_fp8.py +167 -0
- sglang/test/test_custom_ops.py +1 -1
- sglang/test/test_utils.py +3 -1
- sglang/version.py +1 -1
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +5 -5
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +116 -110
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +1 -1
- sglang/lang/__init__.py +0 -0
- sglang/srt/lora/backend/__init__.py +0 -25
- sglang/srt/server.py +0 -18
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
sglang/test/test_block_fp8.py
CHANGED
@@ -7,6 +7,7 @@ import torch
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
 from sglang.srt.layers.quantization.fp8_kernel import (
+    per_tensor_quant_mla_deep_gemm_masked_fp8,
     per_tensor_quant_mla_fp8,
     per_token_group_quant_fp8,
     static_quant_fp8,
@@ -212,6 +213,62 @@ class TestPerTensorQuantMlaFP8(CustomTestCase):
                 self._per_tensor_quant_mla_fp8(*params)
 
 
+class TestPerTokenGroupQuantMlaDeepGemmMaskedFP8(CustomTestCase):
+    DTYPES = [torch.half, torch.bfloat16, torch.float32]
+    B = [128]
+    NUM_TOKENS = [7, 83, 2048, 1024 * 16]
+    D = [512, 128]
+    GROUP_SIZE = [128]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _per_token_group_quant_mla_deep_gemm_masked_fp8(
+        self, b, num_tokens, d, dtype, group_size, seed
+    ):
+        torch.manual_seed(seed)
+
+        x = torch.rand(b, num_tokens, d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out, ref_scale = native_per_token_group_quant_fp8(x, group_size, 1e-12)
+            out, scale, _, _, _ = per_tensor_quant_mla_deep_gemm_masked_fp8(
+                x, group_size
+            )
+            out = out[:, :num_tokens, :]
+            scale = scale[:, :num_tokens, :]
+
+        self.assertTrue(
+            torch.allclose(
+                out.to(torch.float32), ref_out.to(torch.float32), rtol=0.20, atol=1e-2
+            )
+        )
+        self.assertTrue(torch.allclose(scale, ref_scale))
+
+    def test_per_token_group_quant_mla_deep_gemm_masked_fp8(self):
+        for params in itertools.product(
+            self.B,
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.GROUP_SIZE,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                b=params[0],
+                num_tokens=params[1],
+                d=params[2],
+                dtype=params[3],
+                group_size=params[4],
+                seed=params[5],
+            ):
+                self._per_token_group_quant_mla_deep_gemm_masked_fp8(*params)
+
+
 # For test
 def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
     """This function performs matrix multiplication with block-wise quantization using native torch.
@@ -485,5 +542,115 @@ class TestW8A8BlockFP8FusedMoE(CustomTestCase):
                 self._w8a8_block_fp8_fused_moe(*params)
 
 
+# For test
+def torch_w8a8_block_fp8_bmm(a, a_s, w, w_s, block_shape, out_dtype):
+    """This function performs bmm with block-wise quantization using native torch."""
+
+    B, N, _ = w.shape
+    _, M, _ = a.shape
+    out = torch.empty((B, M, N), dtype=out_dtype, device=a.device)
+
+    for i in range(B):
+        out[i] = native_w8a8_block_fp8_matmul(
+            a[i], w[i], a_s[i], w_s[i], block_shape, output_dtype=out_dtype
+        )
+
+    return out
+
+
+class TestW8A8BlockFP8BatchedDeepGemm(CustomTestCase):
+    DTYPES = [torch.bfloat16]
+    M = [1, 33, 64, 222, 8192]
+    N = [128, 512]
+    K = [128, 512]
+    BATCH = [128]
+    BLOCK_SIZE = [[128, 128]]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        try:
+            import deep_gemm
+        except ImportError:
+            raise unittest.SkipTest("DeepGEMM is not available")
+        torch.set_default_device("cuda")
+
+    def _w8a8_block_fp8_batched_deep_gemm(self, M, N, K, B, block_size, dtype, seed):
+        torch.manual_seed(seed)
+        factor_for_scale = 1e-2
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+        a_fp32 = torch.randn((B, M, K), dtype=torch.float32) / 10
+        a = a_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        w_fp32 = (torch.rand((B, N, K), dtype=torch.float32) - 0.5) * 2 * fp8_max
+        w = w_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+        block_n, block_k = block_size[0], block_size[1]
+        n_tiles_w = (N + block_n - 1) // block_n
+        k_tiles_w = (K + block_k - 1) // block_k
+
+        w_s = (
+            torch.rand((B, n_tiles_w, k_tiles_w), dtype=torch.float32)
+            * factor_for_scale
+        )
+        a_s = torch.rand((B, M, k_tiles_w), dtype=torch.float32) * factor_for_scale
+
+        ae = a.new_empty(B, (M + 255) // 256 * 256, K)
+        ae_s = a_s.new_empty(B, (M + 255) // 256 * 256, k_tiles_w)
+        oe = torch.empty((B, (M + 255) // 256 * 256, N), dtype=dtype)
+        ae[:, :M, :] = a
+        ae_s[:, :M, :] = a_s
+
+        masked_m = torch.full((B,), M, dtype=torch.int)
+        expected_m = M
+        lhs = (
+            ae,
+            ae_s,
+        )
+        rhs = (
+            w,
+            w_s,
+        )
+
+        from deep_gemm import m_grouped_gemm_fp8_fp8_bf16_nt_masked
+
+        with torch.inference_mode():
+            ref_out = torch_w8a8_block_fp8_bmm(a, a_s, w, w_s, block_size, dtype)
+            m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs, rhs, oe, masked_m, expected_m)
+            out = oe[:, :M, :]
+
+        self.assertTrue(
+            torch.mean(torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)))
+            / torch.mean(torch.abs(ref_out.to(torch.float32)))
+            < 0.0001
+        )
+
+    def test_w8a8_block_fp8_batched_deep_gemm(self):
+
+        for params in itertools.product(
+            self.M,
+            self.N,
+            self.K,
+            self.BATCH,
+            self.BLOCK_SIZE,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                N=params[1],
+                K=params[2],
+                B=params[3],
+                block_size=params[4],
+                dtype=params[5],
+                seed=params[6],
+            ):
+                self._w8a8_block_fp8_batched_deep_gemm(*params)
+
+
 if __name__ == "__main__":
     unittest.main(verbosity=2)
sglang/test/test_custom_ops.py
CHANGED
sglang/test/test_utils.py
CHANGED
@@ -450,7 +450,9 @@ def popen_launch_server(
 
         return_code = process.poll()
         if return_code is not None:
-            raise Exception(
+            raise Exception(
+                f"Server unexpectedly exits ({return_code=}). Usually there will be error logs describing the cause far above this line."
+            )
 
         time.sleep(10)
 
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.5.post1"
+__version__ = "0.4.5.post3"
{sglang-0.4.5.post1.dist-info → sglang-0.4.5.post3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.5.post1
+Version: 0.4.5.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -246,10 +246,10 @@ Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.9.
+Requires-Dist: sgl-kernel==0.0.9.post2; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
-Requires-Dist: torch==2.
-Requires-Dist: torchvision==0.
+Requires-Dist: torch==2.6.0; extra == "srt"
+Requires-Dist: torchvision==0.21.0; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: outlines<=0.1.11,>=0.0.44; extra == "srt"
 Requires-Dist: partial_json_parser; extra == "srt"
@@ -381,7 +381,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs,
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.