sglang 0.4.5.post1__py3-none-any.whl → 0.4.5.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -4
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +0 -4
- sglang/lang/backend/anthropic.py +0 -4
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/openai.py +1 -1
- sglang/lang/backend/vertexai.py +0 -1
- sglang/lang/compiler.py +1 -7
- sglang/lang/tracer.py +3 -7
- sglang/srt/_custom_ops.py +0 -2
- sglang/srt/constrained/outlines_jump_forward.py +14 -1
- sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
- sglang/srt/constrained/xgrammar_backend.py +26 -4
- sglang/srt/custom_op.py +0 -62
- sglang/srt/disaggregation/decode.py +62 -6
- sglang/srt/disaggregation/mini_lb.py +5 -1
- sglang/srt/disaggregation/mooncake/conn.py +32 -62
- sglang/srt/disaggregation/mooncake/transfer_engine.py +30 -61
- sglang/srt/disaggregation/prefill.py +40 -4
- sglang/srt/disaggregation/utils.py +15 -0
- sglang/srt/entrypoints/verl_engine.py +7 -5
- sglang/srt/layers/activation.py +6 -8
- sglang/srt/layers/attention/flashattention_backend.py +114 -71
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/torch_native_backend.py +6 -1
- sglang/srt/layers/attention/triton_backend.py +6 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +13 -2
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +17 -3
- sglang/srt/layers/moe/ep_moe/layer.py +15 -29
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +14 -19
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- sglang/srt/layers/moe/topk.py +27 -30
- sglang/srt/layers/parameter.py +0 -2
- sglang/srt/layers/quantization/__init__.py +1 -0
- sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +8 -2
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +16 -44
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
- sglang/srt/layers/quantization/fp8.py +115 -132
- sglang/srt/layers/quantization/fp8_kernel.py +213 -57
- sglang/srt/layers/quantization/fp8_utils.py +187 -262
- sglang/srt/layers/quantization/moe_wna16.py +2 -0
- sglang/srt/layers/quantization/utils.py +5 -11
- sglang/srt/layers/quantization/w8a8_fp8.py +2 -0
- sglang/srt/layers/quantization/w8a8_int8.py +7 -7
- sglang/srt/layers/radix_attention.py +15 -0
- sglang/srt/layers/rotary_embedding.py +3 -2
- sglang/srt/layers/sampler.py +5 -10
- sglang/srt/lora/backend/base_backend.py +18 -2
- sglang/srt/lora/backend/flashinfer_backend.py +1 -1
- sglang/srt/lora/backend/triton_backend.py +1 -1
- sglang/srt/lora/layers.py +1 -1
- sglang/srt/lora/lora.py +1 -1
- sglang/srt/lora/lora_manager.py +1 -1
- sglang/srt/managers/detokenizer_manager.py +0 -1
- sglang/srt/managers/io_struct.py +1 -0
- sglang/srt/managers/mm_utils.py +4 -3
- sglang/srt/managers/multimodal_processor.py +0 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +3 -2
- sglang/srt/managers/schedule_batch.py +2 -4
- sglang/srt/managers/scheduler.py +12 -71
- sglang/srt/managers/tokenizer_manager.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +5 -1
- sglang/srt/mem_cache/memory_pool.py +7 -2
- sglang/srt/model_executor/cuda_graph_runner.py +2 -2
- sglang/srt/model_executor/model_runner.py +20 -27
- sglang/srt/models/bert.py +398 -0
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_nextn.py +74 -70
- sglang/srt/models/deepseek_v2.py +289 -348
- sglang/srt/models/llama.py +5 -5
- sglang/srt/models/minicpm3.py +29 -201
- sglang/srt/models/qwen2.py +4 -1
- sglang/srt/models/qwen2_moe.py +14 -13
- sglang/srt/models/qwen3.py +335 -0
- sglang/srt/models/qwen3_moe.py +423 -0
- sglang/srt/reasoning_parser.py +0 -1
- sglang/srt/sampling/sampling_batch_info.py +2 -3
- sglang/srt/server_args.py +34 -32
- sglang/srt/speculative/eagle_worker.py +4 -7
- sglang/srt/utils.py +16 -1
- sglang/test/runners.py +5 -1
- sglang/test/test_block_fp8.py +167 -0
- sglang/test/test_custom_ops.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/METADATA +3 -3
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/RECORD +92 -91
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/WHEEL +1 -1
- sglang/lang/__init__.py +0 -0
- sglang/srt/lora/backend/__init__.py +0 -25
- sglang/srt/server.py +0 -18
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.post1.dist-info → sglang-0.4.5.post2.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
@@ -24,6 +24,7 @@ from sglang.api import (
|
|
24
24
|
user_end,
|
25
25
|
video,
|
26
26
|
)
|
27
|
+
from sglang.global_config import global_config
|
27
28
|
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
|
28
29
|
from sglang.lang.choices import (
|
29
30
|
greedy_token_selection,
|
@@ -31,6 +32,7 @@ from sglang.lang.choices import (
|
|
31
32
|
unconditional_likelihood_normalized,
|
32
33
|
)
|
33
34
|
from sglang.utils import LazyImport
|
35
|
+
from sglang.version import __version__
|
34
36
|
|
35
37
|
ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
|
36
38
|
Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
|
@@ -38,10 +40,6 @@ LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
|
|
38
40
|
OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
|
39
41
|
VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
|
40
42
|
|
41
|
-
# Other configs
|
42
|
-
from sglang.global_config import global_config
|
43
|
-
from sglang.version import __version__
|
44
|
-
|
45
43
|
__all__ = [
|
46
44
|
"Engine",
|
47
45
|
"Runtime",
|
sglang/bench_one_batch.py
CHANGED
@@ -207,7 +207,7 @@ def prepare_extend_inputs_for_correctness_test(
|
|
207
207
|
|
208
208
|
|
209
209
|
def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
|
210
|
-
input_ids = np.
|
210
|
+
input_ids = np.random.randint(0, 10000, (batch_size, input_len), dtype=np.int32)
|
211
211
|
sampling_params = SamplingParams(
|
212
212
|
temperature=0,
|
213
213
|
max_new_tokens=BenchArgs.output_len,
|
@@ -396,7 +396,7 @@ def latency_test_run_once(
|
|
396
396
|
decode_latencies.append(latency)
|
397
397
|
if i < 5:
|
398
398
|
rank_print(
|
399
|
-
f"Decode.
|
399
|
+
f"Decode. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
|
400
400
|
)
|
401
401
|
|
402
402
|
if profile:
|
sglang/bench_serving.py
CHANGED
@@ -707,10 +707,6 @@ def sample_random_requests(
|
|
707
707
|
|
708
708
|
# Download sharegpt if necessary
|
709
709
|
if not os.path.isfile(dataset_path):
|
710
|
-
print(
|
711
|
-
"If you do not want to randomly sample from a dataset,"
|
712
|
-
" please use --dataset-name random-ids."
|
713
|
-
)
|
714
710
|
dataset_path = download_and_cache_file(SHAREGPT_URL)
|
715
711
|
|
716
712
|
# Load the dataset.
|
sglang/lang/backend/anthropic.py
CHANGED
sglang/lang/backend/openai.py
CHANGED
sglang/lang/backend/vertexai.py
CHANGED
sglang/lang/compiler.py
CHANGED
@@ -5,13 +5,7 @@ from typing import List, Union
|
|
5
5
|
|
6
6
|
from sglang.global_config import global_config
|
7
7
|
from sglang.lang.interpreter import ProgramState, StreamExecutor, cache_program
|
8
|
-
from sglang.lang.ir import
|
9
|
-
SglArgument,
|
10
|
-
SglConstantText,
|
11
|
-
SglExpr,
|
12
|
-
SglSamplingParams,
|
13
|
-
SglVariable,
|
14
|
-
)
|
8
|
+
from sglang.lang.ir import SglArgument, SglExpr, SglSamplingParams, SglVariable
|
15
9
|
|
16
10
|
|
17
11
|
def compile_func(function, backend):
|
sglang/lang/tracer.py
CHANGED
@@ -1,20 +1,16 @@
|
|
1
1
|
"""Tracing a program."""
|
2
2
|
|
3
3
|
import uuid
|
4
|
-
from typing import Any,
|
4
|
+
from typing import Any, Dict, List, Optional
|
5
5
|
|
6
|
-
from sglang.global_config import global_config
|
7
6
|
from sglang.lang.backend.base_backend import BaseBackend
|
8
7
|
from sglang.lang.interpreter import ProgramState, ProgramStateGroup
|
9
8
|
from sglang.lang.ir import (
|
10
9
|
SglArgument,
|
11
|
-
SglCommitLazy,
|
12
|
-
SglConcateAndAppend,
|
13
10
|
SglConstantText,
|
14
11
|
SglExpr,
|
15
12
|
SglExprList,
|
16
13
|
SglFork,
|
17
|
-
SglFunction,
|
18
14
|
SglGen,
|
19
15
|
SglGetForkItem,
|
20
16
|
SglRoleBegin,
|
@@ -230,8 +226,8 @@ class TracerProgramState(ProgramState):
|
|
230
226
|
self.cur_role = None
|
231
227
|
|
232
228
|
def _execute_var_scope_end(self, expr: SglVarScopeEnd):
|
233
|
-
new_node = SglVariable(name, source=self.last_node)
|
234
|
-
self.variables[name] = new_node
|
229
|
+
new_node = SglVariable(expr.name, source=self.last_node)
|
230
|
+
self.variables[expr.name] = new_node
|
235
231
|
|
236
232
|
def get_var(self, name):
|
237
233
|
ret = self.arguments.get(name, None)
|
sglang/srt/_custom_ops.py
CHANGED
@@ -19,10 +19,13 @@ Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/
|
|
19
19
|
import dataclasses
|
20
20
|
import logging
|
21
21
|
from collections import defaultdict
|
22
|
+
from typing import Optional
|
22
23
|
|
23
24
|
import interegular
|
24
25
|
from interegular import InvalidSyntax
|
25
|
-
from outlines.caching import cache
|
26
|
+
from outlines.caching import cache
|
27
|
+
|
28
|
+
from sglang.srt.utils import get_bool_env_var
|
26
29
|
|
27
30
|
try:
|
28
31
|
# outlines >= 0.1.0
|
@@ -34,6 +37,9 @@ except ImportError:
|
|
34
37
|
|
35
38
|
IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
|
36
39
|
|
40
|
+
# This env var is set in sglang.srt.server_args.ServerArgs.__post_init__
|
41
|
+
DISABLE_DISK_CACHE = get_bool_env_var("SGLANG_DISABLE_OUTLINES_DISK_CACHE", "true")
|
42
|
+
|
37
43
|
logger = logging.getLogger(__name__)
|
38
44
|
|
39
45
|
|
@@ -45,6 +51,13 @@ class JumpEdge:
|
|
45
51
|
byte_next_state: int = None
|
46
52
|
|
47
53
|
|
54
|
+
def disk_cache(expire: Optional[float] = None, typed=False, ignore=()):
    """Build a decorator that optionally caches a function through outlines.

    When ``DISABLE_DISK_CACHE`` is false the arguments are forwarded to
    ``outlines.caching.cache`` and its decorator is returned. When it is true
    the returned decorator is a pass-through that hands the decorated function
    back unchanged.

    Args:
        expire: Forwarded positionally to ``cache``.
        typed: Forwarded positionally to ``cache``.
        ignore: Forwarded positionally to ``cache``.

    Returns:
        A decorator taking the function to wrap.
    """
    if not DISABLE_DISK_CACHE:
        return cache(expire, typed, ignore)
    # Caching is disabled: the decorator must return the function itself.
    # Returning None here would rebind every ``@disk_cache()``-decorated name
    # to None, so any later call to it would raise TypeError.
    return lambda fn: fn
|
59
|
+
|
60
|
+
|
48
61
|
@disk_cache()
|
49
62
|
def init_state_to_jump_forward(regex_string):
|
50
63
|
try:
|
@@ -0,0 +1,141 @@
|
|
1
|
+
# Adapted from
|
2
|
+
# https://github.com/mlc-ai/xgrammar/blob/v0.1.17/python/xgrammar/kernels/apply_token_bitmask_inplace_triton.py
|
3
|
+
|
4
|
+
from typing import List, Optional, Union
|
5
|
+
|
6
|
+
import torch
|
7
|
+
import triton
|
8
|
+
import triton.language as tl
|
9
|
+
|
10
|
+
from sglang.srt.utils import get_device_core_count
|
11
|
+
|
12
|
+
|
13
|
+
@triton.jit
def apply_token_bitmask_inplace_kernel(
    logits_ptr,
    bitmask_ptr,
    indices_ptr,
    num_rows,
    vocab_size,
    logits_strides,
    bitmask_strides,
    NUM_SMS: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
):
    """Write -inf into every logit whose bit in the packed bitmask is 0.

    The bitmask packs 32 tokens per int32 word: a set bit keeps the token, a
    cleared bit masks it. Logits are modified in place.

    Parameters
    ----------
    logits_ptr : tl.tensor
        Pointer to the logits tensor that is updated in place.

    bitmask_ptr : tl.tensor
        Pointer to the packed bitmask tensor.

    indices_ptr : Optional[tl.tensor]
        Optional pointer to the row indices to process. When it is None the
        rows ``0 .. num_rows - 1`` are processed directly.

    num_rows : int
        Number of rows to process (the number of indices when ``indices_ptr``
        is given).

    vocab_size : int
        Number of leading columns of each logits row that may be masked;
        columns at or beyond this bound are never written.

    logits_strides : int
        Distance, in elements, between consecutive rows of the logits tensor.

    bitmask_strides : int
        Distance, in int32 words, between consecutive rows of the bitmask.
        It also bounds which bitmask words are loaded for a row.

    NUM_SMS : int
        Step of the work loop: each program starts at its program id and
        advances by this amount.

    BLOCK_SIZE : int
        Number of logits columns handled per work item.
    """

    program = tl.program_id(0)
    blocks_per_row = tl.cdiv(vocab_size, BLOCK_SIZE)
    total_work = num_rows * blocks_per_row
    for work_id in tl.range(program, total_work, NUM_SMS):
        # Decompose the flat work id into (row, column block).
        row = work_id // blocks_per_row
        col_start = (work_id % blocks_per_row) * BLOCK_SIZE
        target_row = row if indices_ptr is None else tl.load(indices_ptr + row)

        col_offsets = col_start + tl.arange(0, BLOCK_SIZE)
        word_offsets = col_start // 32 + tl.arange(0, BLOCK_SIZE // 32)
        in_vocab = col_offsets < vocab_size
        word_in_row = word_offsets < bitmask_strides

        words = tl.load(
            bitmask_ptr + target_row * bitmask_strides + word_offsets,
            mask=word_in_row,
        )
        # Expand each 32-bit word into 32 booleans; True marks a masked token.
        bit_positions = tl.arange(0, 32)
        rejected = ((words[:, None] >> bit_positions[None, :]) & 1) == 0
        rejected = rejected.reshape(BLOCK_SIZE)

        tl.store(
            logits_ptr + target_row * logits_strides + col_offsets,
            -float("inf"),
            mask=in_vocab & rejected,
        )
|
82
|
+
|
83
|
+
|
84
|
+
def apply_token_bitmask_inplace_triton(
    logits: torch.Tensor,
    bitmask: torch.Tensor,
    indices: Optional[Union[List[int], torch.Tensor]] = None,
):
    """Launch ``apply_token_bitmask_inplace_kernel`` on ``logits`` in place.

    Args:
        logits: 1-D or 2-D logits tensor; a 1-D tensor is treated as one row.
        bitmask: int32 tensor, 1-D or 2-D, holding 32 tokens per element; a
            1-D tensor is treated as one row.
        indices: Optional row indices (list or tensor) to apply the mask to.
            When omitted, every row is processed and ``logits`` and
            ``bitmask`` must have the same number of rows.

    Raises:
        AssertionError: If ``bitmask`` is not int32, if it is wider than the
            logits require, or (without ``indices``) if the row counts differ.
    """
    NUM_SMS = get_device_core_count()
    BLOCK_SIZE = 4096
    BITS_PER_BLOCK = 32

    # Check input dtype
    assert bitmask.dtype == torch.int32, "bitmask must be of type int32"

    # Check input tensor shapes, viewing 1-D inputs as a single row.
    logits_shape = logits.shape
    bitmask_shape = bitmask.shape
    if logits.ndim == 1:
        logits_shape = (1, logits_shape[0])
    if bitmask.ndim == 1:
        bitmask_shape = (1, bitmask_shape[0])

    required_bitmask_width = (logits_shape[1] + BITS_PER_BLOCK - 1) // BITS_PER_BLOCK
    assert required_bitmask_width >= bitmask_shape[1], (
        f"Bitmask width too large: allow at most {required_bitmask_width} int32s for "
        f"logits' width {logits_shape[1]}, but got {bitmask_shape[1]}"
    )

    # Only columns covered by both the logits and the bitmask are masked.
    vocab_size = min(logits_shape[1], bitmask_shape[1] * BITS_PER_BLOCK)

    if isinstance(indices, (list, torch.Tensor)):
        # as_tensor converts lists exactly like torch.tensor, but reuses an
        # existing tensor (no copy-construct warning) when dtype and device
        # already match; the kernel only reads from it.
        indices = torch.as_tensor(indices, dtype=torch.int32, device=logits.device)
        num_rows = indices.shape[0]
    else:
        assert (
            logits_shape[0] == bitmask_shape[0]
        ), f"batch size mismatch: logits {logits_shape[0]} vs bitmask {bitmask_shape[0]}"
        num_rows = logits_shape[0]

    if NUM_SMS > 0:
        # One program per reported core; each loops over work items.
        grid = (NUM_SMS,)
    else:
        # No core count available: launch one program per (row, block).
        num_blocks = triton.cdiv(vocab_size, BLOCK_SIZE)
        grid = (num_rows * num_blocks,)
        NUM_SMS = triton.next_power_of_2(grid[0])

    # NOTE(review): the tensor widths are passed as the row strides, which
    # presumes both tensors are row-contiguous — confirm against callers.
    apply_token_bitmask_inplace_kernel[grid](
        logits,
        bitmask,
        indices,
        num_rows,
        vocab_size,
        logits_shape[1],
        bitmask_shape[1],
        NUM_SMS,
        BLOCK_SIZE,
        num_warps=BLOCK_SIZE // 32 // (16 // logits.element_size()),
        num_stages=3,
    )
|
@@ -25,13 +25,16 @@ from xgrammar import (
|
|
25
25
|
StructuralTagItem,
|
26
26
|
TokenizerInfo,
|
27
27
|
allocate_token_bitmask,
|
28
|
-
apply_token_bitmask_inplace,
|
29
28
|
)
|
30
29
|
|
31
30
|
from sglang.srt.constrained.base_grammar_backend import (
|
32
31
|
BaseGrammarBackend,
|
33
32
|
BaseGrammarObject,
|
34
33
|
)
|
34
|
+
from sglang.srt.constrained.triton_ops.bitmask_ops import (
|
35
|
+
apply_token_bitmask_inplace_triton,
|
36
|
+
)
|
37
|
+
from sglang.srt.utils import get_bool_env_var
|
35
38
|
|
36
39
|
logger = logging.getLogger(__name__)
|
37
40
|
|
@@ -55,6 +58,18 @@ class XGrammarGrammar(BaseGrammarObject):
|
|
55
58
|
self.override_stop_tokens = override_stop_tokens
|
56
59
|
self.finished = False
|
57
60
|
|
61
|
+
# Fix (from vLLM team): postpone the import of apply_token_bitmask_inplace_kernels to the
|
62
|
+
# class init site to avoid re-initializing CUDA in forked subprocess.
|
63
|
+
from xgrammar.kernels import apply_token_bitmask_inplace_kernels
|
64
|
+
|
65
|
+
self.use_token_bitmask_triton = get_bool_env_var(
|
66
|
+
"SGLANG_TOKEN_BITMASK_TRITON", "false"
|
67
|
+
)
|
68
|
+
self.apply_vocab_mask_cuda = apply_token_bitmask_inplace_kernels.get(
|
69
|
+
"cuda", None
|
70
|
+
)
|
71
|
+
self.apply_vocab_mask_cpu = apply_token_bitmask_inplace_kernels.get("cpu", None)
|
72
|
+
|
58
73
|
def accept_token(self, token: int):
|
59
74
|
assert self.matcher.accept_token(token)
|
60
75
|
|
@@ -97,9 +112,16 @@ class XGrammarGrammar(BaseGrammarObject):
|
|
97
112
|
def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
|
98
113
|
return vocab_mask.to(device, non_blocking=True)
|
99
114
|
|
100
|
-
|
101
|
-
|
102
|
-
|
115
|
+
def apply_vocab_mask(self, logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
    """Apply ``vocab_mask`` to ``logits`` with the kernel matching the device.

    On CUDA the ``apply_vocab_mask_cuda`` kernel is used when it is available
    and the Triton override flag is off; on CPU the ``apply_vocab_mask_cpu``
    kernel is used when available. Every other case falls back to
    ``apply_token_bitmask_inplace_triton``.
    """
    device_type = logits.device.type
    if device_type == "cuda":
        if not self.use_token_bitmask_triton:
            cuda_kernel = self.apply_vocab_mask_cuda
            if cuda_kernel:
                return cuda_kernel(logits, vocab_mask)
    elif device_type == "cpu":
        cpu_kernel = self.apply_vocab_mask_cpu
        if cpu_kernel:
            return cpu_kernel(logits, vocab_mask)
    # No native kernel applies: use the Triton implementation.
    apply_token_bitmask_inplace_triton(logits, vocab_mask)
|
103
125
|
|
104
126
|
def copy(self):
|
105
127
|
matcher = GrammarMatcher(
|
sglang/srt/custom_op.py
CHANGED
@@ -42,65 +42,3 @@ class CustomOp(nn.Module):
|
|
42
42
|
return self.forward_hip
|
43
43
|
else:
|
44
44
|
return self.forward_native
|
45
|
-
|
46
|
-
|
47
|
-
if _is_cuda:
|
48
|
-
from sgl_kernel import sgl_per_tensor_quant_fp8, sgl_per_token_quant_fp8
|
49
|
-
|
50
|
-
def scaled_fp8_quant(
|
51
|
-
input: torch.Tensor,
|
52
|
-
scale: Optional[torch.Tensor] = None,
|
53
|
-
num_token_padding: Optional[int] = None,
|
54
|
-
use_per_token_if_dynamic: bool = False,
|
55
|
-
) -> tuple[torch.Tensor, torch.Tensor]:
|
56
|
-
"""
|
57
|
-
Quantize input tensor to FP8 (8-bit floating point) format.
|
58
|
-
|
59
|
-
Args:
|
60
|
-
input (torch.Tensor): Input tensor to be quantized
|
61
|
-
scale (Optional[torch.Tensor]): Pre-computed scaling factor for static quantization.
|
62
|
-
If None, scales will be computed dynamically.
|
63
|
-
num_token_padding (Optional[int]): If specified, pad the first dimension
|
64
|
-
of the output to at least this value.
|
65
|
-
use_per_token_if_dynamic (bool): When using dynamic scaling (scale=None),
|
66
|
-
determines the quantization granularity:
|
67
|
-
- True: compute scale per token
|
68
|
-
- False: compute single scale per tensor
|
69
|
-
|
70
|
-
Returns:
|
71
|
-
Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
|
72
|
-
- quantized_tensor: The FP8 quantized version of input
|
73
|
-
- scale_tensor: The scaling factors used for quantization
|
74
|
-
|
75
|
-
Raises:
|
76
|
-
AssertionError: If input is not 2D or if static scale's numel != 1
|
77
|
-
"""
|
78
|
-
assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
|
79
|
-
shape = input.shape
|
80
|
-
out_dtype = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
|
81
|
-
if num_token_padding:
|
82
|
-
shape = (max(num_token_padding, input.shape[0]), shape[1])
|
83
|
-
output = torch.empty(shape, device=input.device, dtype=out_dtype)
|
84
|
-
|
85
|
-
if scale is None:
|
86
|
-
# Dynamic scaling
|
87
|
-
if use_per_token_if_dynamic:
|
88
|
-
scale = torch.empty(
|
89
|
-
(shape[0], 1), device=input.device, dtype=torch.float32
|
90
|
-
)
|
91
|
-
sgl_per_token_quant_fp8(input, output, scale)
|
92
|
-
else:
|
93
|
-
scale = torch.zeros(1, device=input.device, dtype=torch.float32)
|
94
|
-
sgl_per_tensor_quant_fp8(
|
95
|
-
input, output, scale, is_static=False
|
96
|
-
) # False for dynamic
|
97
|
-
else:
|
98
|
-
# Static scaling
|
99
|
-
assert (
|
100
|
-
scale.numel() == 1
|
101
|
-
), f"Expected scalar scale, got numel={scale.numel()}"
|
102
|
-
sgl_per_tensor_quant_fp8(
|
103
|
-
input, output, scale, is_static=True
|
104
|
-
) # True for static
|
105
|
-
|
106
|
-
return output, scale
|
@@ -35,6 +35,7 @@ from sglang.srt.disaggregation.utils import (
|
|
35
35
|
ReqToMetadataIdxAllocator,
|
36
36
|
TransferBackend,
|
37
37
|
get_kv_class,
|
38
|
+
kv_to_page_indices,
|
38
39
|
poll_and_all_reduce,
|
39
40
|
)
|
40
41
|
from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
|
@@ -121,7 +122,7 @@ class DecodePreallocQueue:
|
|
121
122
|
kv_args.aux_item_lens = [
|
122
123
|
metadata_buffer[0].nbytes for metadata_buffer in self.metadata_buffers
|
123
124
|
]
|
124
|
-
kv_args.ib_device =
|
125
|
+
kv_args.ib_device = self.scheduler.server_args.disaggregation_ib_device
|
125
126
|
kv_args.gpu_id = self.scheduler.gpu_id
|
126
127
|
kv_manager_class = get_kv_class(self.transfer_backend, KVClassType.MANAGER)
|
127
128
|
kv_manager = kv_manager_class(
|
@@ -205,7 +206,10 @@ class DecodePreallocQueue:
|
|
205
206
|
self.req_to_metadata_buffer_idx_allocator.alloc()
|
206
207
|
)
|
207
208
|
assert decode_req.metadata_buffer_index is not None
|
208
|
-
|
209
|
+
page_indices = kv_to_page_indices(
|
210
|
+
kv_indices, self.token_to_kv_pool_allocator.page_size
|
211
|
+
)
|
212
|
+
decode_req.kv_receiver.init(page_indices, decode_req.metadata_buffer_index)
|
209
213
|
preallocated_reqs.append(decode_req)
|
210
214
|
indices_to_remove.add(i)
|
211
215
|
|
@@ -245,10 +249,30 @@ class DecodePreallocQueue:
|
|
245
249
|
assert req_pool_indices is not None
|
246
250
|
|
247
251
|
req.req_pool_idx = req_pool_indices[0]
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
+
if self.token_to_kv_pool_allocator.page_size == 1:
|
253
|
+
kv_loc = self.token_to_kv_pool_allocator.alloc(
|
254
|
+
len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0)
|
255
|
+
)
|
256
|
+
else:
|
257
|
+
num_tokens = len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0)
|
258
|
+
kv_loc = self.token_to_kv_pool_allocator.alloc_extend(
|
259
|
+
prefix_lens=torch.tensor(
|
260
|
+
[0],
|
261
|
+
dtype=torch.int64,
|
262
|
+
device=self.token_to_kv_pool_allocator.device,
|
263
|
+
),
|
264
|
+
seq_lens=torch.tensor(
|
265
|
+
[num_tokens],
|
266
|
+
dtype=torch.int64,
|
267
|
+
device=self.token_to_kv_pool_allocator.device,
|
268
|
+
),
|
269
|
+
last_loc=torch.tensor(
|
270
|
+
[-1],
|
271
|
+
dtype=torch.int64,
|
272
|
+
device=self.token_to_kv_pool_allocator.device,
|
273
|
+
),
|
274
|
+
extend_num_tokens=num_tokens,
|
275
|
+
)
|
252
276
|
assert kv_loc is not None
|
253
277
|
|
254
278
|
self.req_to_token_pool.write((req.req_pool_idx, slice(0, len(kv_loc))), kv_loc)
|
@@ -419,6 +443,38 @@ class ScheduleBatchDisaggregationDecodeMixin:
|
|
419
443
|
|
420
444
|
class SchedulerDisaggregationDecodeMixin:
|
421
445
|
|
446
|
+
@torch.no_grad()
def event_loop_normal_disagg_decode(self):
    """Run the decode worker's scheduler loop in disaggregation mode.

    Each iteration receives and processes incoming requests, polls the decode
    queues, then either streams output for an extend batch or runs a decode
    batch. The loop never returns on its own.
    """

    while True:
        self.process_input_requests(self.recv_requests())
        # Poll transfers and allocate KV cache before picking a batch.
        self.process_decode_queue()
        next_batch = self.get_next_disagg_decode_batch_to_run()
        self.cur_batch = next_batch

        if next_batch:
            if not next_batch.forward_mode.is_extend():
                outcome = self.run_batch(next_batch)
                self.process_batch_result(next_batch, outcome)
            else:
                # Extend batches only produce a fake extend output here.
                # Note: Logprobs should be handled on the prefill engine.
                self.stream_output(next_batch.reqs, False)

        if next_batch is None:
            queued = len(self.disagg_decode_transfer_queue.queue) + len(
                self.disagg_decode_prealloc_queue.queue
            )
            if queued == 0:
                # Idle server: run the self-check and reset the token ratio.
                self.check_memory()
                self.new_token_ratio = self.init_new_token_ratio

        self.last_batch = next_batch
|
477
|
+
|
422
478
|
def get_next_disagg_decode_batch_to_run(
|
423
479
|
self: Scheduler,
|
424
480
|
) -> Optional[Tuple[ScheduleBatch, bool]]:
|
@@ -26,7 +26,11 @@ class MiniLoadBalancer:
|
|
26
26
|
self, modified_request, prefill_server, decode_server
|
27
27
|
) -> ORJSONResponse:
|
28
28
|
|
29
|
-
async with aiohttp.ClientSession(
|
29
|
+
async with aiohttp.ClientSession(
|
30
|
+
timeout=aiohttp.ClientTimeout(
|
31
|
+
total=3600
|
32
|
+
) # Add timeout for request reliability
|
33
|
+
) as session:
|
30
34
|
tasks = [
|
31
35
|
session.post(f"{prefill_server}/generate", json=modified_request),
|
32
36
|
session.post(f"{decode_server}/generate", json=modified_request),
|