sglang 0.5.2rc0__py3-none-any.whl → 0.5.2rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/interpreter.py +1 -1
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/model_config.py +2 -1
- sglang/srt/disaggregation/mini_lb.py +2 -2
- sglang/srt/distributed/parallel_state.py +46 -41
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/entrypoints/http_server.py +5 -1
- sglang/srt/entrypoints/openai/protocol.py +3 -3
- sglang/srt/entrypoints/openai/serving_chat.py +3 -3
- sglang/srt/entrypoints/openai/serving_completions.py +3 -1
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -1
- sglang/srt/entrypoints/openai/serving_responses.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/communicator.py +45 -7
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +1 -9
- sglang/srt/layers/moe/ep_moe/layer.py +2 -7
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -1048
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +796 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/moe/utils.py +0 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +8 -0
- sglang/srt/layers/quantization/modelopt_quant.py +35 -2
- sglang/srt/layers/quantization/mxfp4.py +4 -1
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/w4afp8.py +30 -25
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +0 -18
- sglang/srt/managers/cache_controller.py +42 -39
- sglang/srt/managers/detokenizer_manager.py +0 -34
- sglang/srt/managers/multi_tokenizer_mixin.py +48 -6
- sglang/srt/managers/schedule_policy.py +3 -2
- sglang/srt/managers/scheduler.py +7 -100
- sglang/srt/managers/scheduler_metrics_mixin.py +113 -7
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_manager.py +1 -0
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +15 -10
- sglang/srt/mem_cache/hiradix_cache.py +16 -0
- sglang/srt/mem_cache/memory_pool_host.py +18 -11
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +35 -6
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +32 -13
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/metrics/collector.py +12 -4
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/forward_batch_info.py +16 -17
- sglang/srt/model_executor/model_runner.py +1 -1
- sglang/srt/models/deepseek_v2.py +245 -36
- sglang/srt/models/glm4_moe.py +10 -1
- sglang/srt/models/gpt_oss.py +5 -4
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/longcat_flash.py +26 -15
- sglang/srt/models/longcat_flash_nextn.py +23 -15
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/qwen2_moe.py +4 -1
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +39 -8
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/server_args.py +79 -2
- sglang/srt/speculative/eagle_worker.py +158 -112
- sglang/srt/utils.py +12 -10
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/METADATA +2 -2
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/RECORD +83 -76
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,161 @@
|
|
1
|
+
import logging
|
2
|
+
import uuid
|
3
|
+
|
4
|
+
import torch
|
5
|
+
from mooncake_store import MooncakeStore
|
6
|
+
|
7
|
+
from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
|
8
|
+
|
9
|
+
logging.basicConfig(
|
10
|
+
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
11
|
+
)
|
12
|
+
logger = logging.getLogger(__name__)
|
13
|
+
|
14
|
+
|
15
|
+
def generate_batch_query_keys(kv_num: int, config: "HiCacheStorageConfig"):
    """Build random key sets for exercising the batch set/get/exists APIs.

    Args:
        kv_num: Number of base keys to generate.
        config: Storage config; only ``is_mla_model`` and ``tp_rank`` are read.

    Returns:
        A ``(set_keys, get_keys, exist_keys)`` tuple.

        * ``exist_keys`` holds the ``kv_num`` base keys (``"test_<uuid4>"``).
        * ``set_keys`` holds the suffixed keys: one ``<key>_k`` entry per base
          key when ``config.is_mla_model`` is true, otherwise a
          ``<key>_<tp_rank>_k`` / ``<key>_<tp_rank>_v`` pair per base key.
        * ``get_keys`` is the very same list object as ``set_keys``.
    """
    keys = ["test_" + str(uuid.uuid4()) for _ in range(kv_num)]

    if config.is_mla_model:
        set_keys = [key + "_k" for key in keys]
    else:
        # The rank is loop-invariant, so read it once.
        rank = config.tp_rank
        set_keys = []
        for key in keys:
            set_keys.extend((f"{key}_{rank}_k", f"{key}_{rank}_v"))

    # Reads use exactly the keys that were written; existence checks use the
    # un-suffixed base keys.
    get_keys = set_keys
    exist_keys = keys
    return set_keys, get_keys, exist_keys
|
30
|
+
|
31
|
+
|
32
|
+
def test_single_operation():
    """Round-trip one value through ``set`` / ``exists`` / ``get``.

    A default-constructed ``MooncakeStore`` is given one registered float32
    buffer. The first ``value_elements`` entries are written under a random
    key, read back into the next ``value_elements`` entries of the same
    buffer, and the two regions are compared.

    Raises:
        AssertionError: If any store call reports failure or the data read
            back differs from the data written.
    """
    print("=" * 100)
    print("Testing single operation")

    # Element count, not bytes: 16Mi float32 values occupy 64 MiB.
    buffer_size = 1024 * 1024 * 16
    value_elements = 1024
    store = MooncakeStore()
    buffer = torch.randn(buffer_size, dtype=torch.float32)
    store.register_buffer(buffer)
    value_size = value_elements * buffer.element_size()  # bytes per value

    key = str(uuid.uuid4())
    # Source and destination are disjoint regions of the registered buffer.
    set_slice = buffer[:value_elements]
    get_slice = buffer[value_elements : 2 * value_elements]
    set_location = set_slice.data_ptr()
    get_location = get_slice.data_ptr()

    # Test set operation
    result = store.set(key, target_location=set_location, target_sizes=value_size)
    assert result is True, f"❌set operation failed for key: {key}"

    # Test exists operation
    assert store.exists(key), f"❌key {key} should exist after set operation"

    # Test get operation
    result = store.get(key, target_location=get_location, target_sizes=value_size)
    assert result is True, f"❌get operation failed for key: {key}"

    # The bytes read back must match what was written.
    assert torch.allclose(
        set_slice, get_slice, atol=1e-6
    ), f"❌data mismatch after get operation for key: {key}"

    logger.info("✅ Single operation passed")
|
67
|
+
|
68
|
+
|
69
|
+
def test_batch_operation(config: HiCacheStorageConfig):
    """Round-trip several values through ``batch_set`` / ``batch_exists`` / ``batch_get``.

    Keys come from ``generate_batch_query_keys``. Values are written from
    consecutive ``value_elements``-sized slices at the start of one registered
    float32 buffer and read back into the slices immediately after them.

    Args:
        config: Storage configuration passed to ``MooncakeStore`` and to the
            key generator.

    Raises:
        AssertionError: If any store call reports failure or any value read
            back differs from the value written.

    NOTE(review): the ``config`` parameter has no default and no fixture is
    visible here, so this function is presumably meant to be driven from the
    ``__main__`` block rather than collected by pytest; confirm.
    """
    print("=" * 100)
    print(f"Testing batch operation with config: {config}")

    # Element count, not bytes: 16Mi float32 values occupy 64 MiB.
    buffer_size = 1024 * 1024 * 16
    value_elements = 256
    kv_num = 13
    store = MooncakeStore(config)
    buffer = torch.randn(buffer_size, dtype=torch.float32)
    store.register_buffer(buffer)
    value_size = value_elements * buffer.element_size()  # bytes per value

    set_keys, get_keys, exist_keys = generate_batch_query_keys(kv_num, config)
    num_set_keys = len(set_keys)

    # One source slice per key, laid out back to back from the buffer start.
    set_slices = [
        buffer[i * value_elements : (i + 1) * value_elements]
        for i in range(num_set_keys)
    ]
    set_locations = [set_slice.data_ptr() for set_slice in set_slices]
    target_sizes = [value_size] * num_set_keys

    # Test batch set operation
    result = store.batch_set(
        set_keys, target_locations=set_locations, target_sizes=target_sizes
    )
    assert result is True, "❌batch set operation failed"

    # Test batch exists operation
    assert store.batch_exists(
        exist_keys
    ), "❌keys should exist after batch set operation"

    # Test batch get operation: destinations start right after the last
    # source slice so reads never overwrite the data being compared against.
    get_slices = [
        buffer[
            (num_set_keys + i) * value_elements : (num_set_keys + i + 1)
            * value_elements
        ]
        for i in range(len(get_keys))
    ]
    get_locations = [get_slice.data_ptr() for get_slice in get_slices]
    result = store.batch_get(
        get_keys, target_locations=get_locations, target_sizes=target_sizes
    )
    assert result == kv_num, "❌batch get operation failed"
    for key, written, read_back in zip(get_keys, set_slices, get_slices):
        assert torch.allclose(
            written, read_back, atol=1e-6
        ), f"❌data mismatch after batch get operation for key: {key}"

    logger.info("✅ Batch operation passed")
|
121
|
+
|
122
|
+
|
123
|
+
if __name__ == "__main__":
    # Script entry point: run the single-key round trip, then the batch round
    # trip once per storage configuration below.
    test_single_operation()

    # (is_mla_model, tp_rank, tp_size) combinations, in execution order.
    batch_cases = (
        (False, 0, 1),
        (True, 0, 1),
        (False, 1, 4),
        (True, 3, 8),
    )
    for is_mla_model, tp_rank, tp_size in batch_cases:
        test_batch_operation(
            HiCacheStorageConfig(
                is_mla_model=is_mla_model,
                tp_rank=tp_rank,
                tp_size=tp_size,
                model_name=None,
                is_page_first_layout=True,
            )
        )
    logger.info("✅ All tests passed")
|
sglang/srt/metrics/collector.py
CHANGED
@@ -18,6 +18,8 @@ from dataclasses import dataclass
|
|
18
18
|
from enum import Enum
|
19
19
|
from typing import Dict, List, Optional, Union
|
20
20
|
|
21
|
+
from sglang.srt.metrics.utils import generate_buckets
|
22
|
+
from sglang.srt.server_args import ServerArgs
|
21
23
|
from sglang.srt.utils import get_bool_env_var
|
22
24
|
|
23
25
|
SGLANG_TEST_REQUEST_TIME_STATS = get_bool_env_var("SGLANG_TEST_REQUEST_TIME_STATS")
|
@@ -309,6 +311,7 @@ class SchedulerMetricsCollector:
|
|
309
311
|
class TokenizerMetricsCollector:
|
310
312
|
def __init__(
|
311
313
|
self,
|
314
|
+
server_args: ServerArgs,
|
312
315
|
labels: Dict[str, str],
|
313
316
|
bucket_time_to_first_token: Optional[List[float]] = None,
|
314
317
|
bucket_inter_token_latency: Optional[List[float]] = None,
|
@@ -334,7 +337,7 @@ class TokenizerMetricsCollector:
|
|
334
337
|
)
|
335
338
|
|
336
339
|
if collect_tokens_histogram:
|
337
|
-
|
340
|
+
default_bucket_prompt_tokens = [
|
338
341
|
100,
|
339
342
|
300,
|
340
343
|
500,
|
@@ -363,9 +366,11 @@ class TokenizerMetricsCollector:
|
|
363
366
|
name="sglang:prompt_tokens_histogram",
|
364
367
|
documentation="Histogram of prompt token length.",
|
365
368
|
labelnames=labels.keys(),
|
366
|
-
buckets=
|
369
|
+
buckets=generate_buckets(
|
370
|
+
server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
|
371
|
+
),
|
367
372
|
)
|
368
|
-
|
373
|
+
default_bucket_generation_tokens = [
|
369
374
|
100,
|
370
375
|
300,
|
371
376
|
500,
|
@@ -390,7 +395,10 @@ class TokenizerMetricsCollector:
|
|
390
395
|
name="sglang:generation_tokens_histogram",
|
391
396
|
documentation="Histogram of generation token length.",
|
392
397
|
labelnames=labels.keys(),
|
393
|
-
buckets=
|
398
|
+
buckets=generate_buckets(
|
399
|
+
server_args.generation_tokens_buckets,
|
400
|
+
default_bucket_generation_tokens,
|
401
|
+
),
|
394
402
|
)
|
395
403
|
|
396
404
|
self.cached_tokens_total = Counter(
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# Copyright 2023-2025 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
14
|
+
"""Utilities for Prometheus Metrics."""
|
15
|
+
import math
|
16
|
+
from typing import List
|
17
|
+
|
18
|
+
|
19
|
+
def two_sides_exponential_buckets(
    middle: float, base: float, count: int
) -> List[float]:
    """Return bucket bounds spread exponentially on both sides of ``middle``.

    ``ceil(count / 2)`` steps are taken. The distance starts at 1 and is
    multiplied by ``base`` before each step; every step contributes
    ``middle + distance`` and ``max(0, middle - distance)``. ``middle`` itself
    is always included.

    Args:
        middle: Centre value of the bucket range.
        base: Factor the distance from ``middle`` grows by at each step.
        count: Controls the number of steps (``ceil(count / 2)``).

    Returns:
        The distinct bounds in ascending order. The lower side is clamped at
        0, so clamped duplicates collapse and the result may hold fewer than
        ``2 * ceil(count / 2) + 1`` values.
    """
    half_count = math.ceil(count / 2)
    # Collect straight into a set: clamping can produce repeated zeros.
    buckets = {middle}
    distance = 1
    for _ in range(half_count):
        distance *= base
        buckets.add(middle + distance)
        buckets.add(max(0, middle - distance))
    return sorted(buckets)
|
31
|
+
|
32
|
+
|
33
|
+
def generate_buckets(
    buckets_rule: List[str], default_buckets: List[float]
) -> List[float]:
    """Turn a bucket rule into a sorted, de-duplicated list of bucket bounds.

    The first element of ``buckets_rule`` selects the rule:

    * ``"default"`` (also used when ``buckets_rule`` is empty or ``None``):
      use ``default_buckets``.
    * ``"tse"``: followed by ``middle``, ``base`` and ``count``; delegates to
      ``two_sides_exponential_buckets``.
    * ``"customer"``: every remaining element is parsed as a float bound.

    Args:
        buckets_rule: Rule name followed by its string parameters.
        default_buckets: Bounds returned for the ``"default"`` rule.

    Returns:
        The distinct bucket bounds in ascending order.

    Raises:
        AssertionError: If the rule name is unknown or the ``tse`` base is
            not greater than 1.0.
        ValueError: If ``tse`` does not get exactly three parameters or a
            parameter cannot be parsed as a number.
    """
    if not buckets_rule:
        buckets_rule = ["default"]

    rule, *params = buckets_rule
    if rule == "tse":
        if len(params) != 3:
            raise ValueError(
                "The 'tse' rule expects exactly 3 parameters "
                f"(middle, base, count), got {len(params)}"
            )
        middle, base, count = params
        # Raised explicitly (not via ``assert``) so the check survives
        # ``python -O`` while the exception type stays the same.
        if not float(base) > 1.0:
            raise AssertionError("Base must be greater than 1.0")
        return two_sides_exponential_buckets(float(middle), float(base), int(count))
    if rule == "default":
        return sorted(set(default_buckets))
    if rule != "customer":
        raise AssertionError(
            f"Unknown buckets rule {rule!r}; expected 'tse', 'default' or 'customer'"
        )
    return sorted({float(x) for x in params})
|
@@ -516,24 +516,23 @@ class ForwardBatch:
|
|
516
516
|
for batch_idx in range(batch_size):
|
517
517
|
mm_input = batch.multimodal_inputs[batch_idx]
|
518
518
|
if self.forward_mode.is_decode():
|
519
|
-
mrope_position_deltas = (
|
520
|
-
[0]
|
521
|
-
if mm_input is None
|
522
|
-
else flatten_nested_list(mm_input.mrope_position_delta.tolist())
|
523
|
-
)
|
524
|
-
next_input_positions = []
|
525
|
-
for mrope_position_delta in mrope_position_deltas:
|
526
|
-
# batched deltas needs to be processed separately
|
527
|
-
# Convert list of lists to tensor with shape [3, seq_len]
|
528
|
-
next_input_positions += [
|
529
|
-
MRotaryEmbedding.get_next_input_positions(
|
530
|
-
mrope_position_delta,
|
531
|
-
int(self.seq_lens[batch_idx]) - 1,
|
532
|
-
int(self.seq_lens[batch_idx]),
|
533
|
-
)
|
534
|
-
]
|
535
519
|
# 3 * N
|
536
|
-
|
520
|
+
if mm_input is None:
|
521
|
+
mrope_positions_list[batch_idx] = torch.full(
|
522
|
+
(3, 1),
|
523
|
+
self.seq_lens[batch_idx] - 1,
|
524
|
+
dtype=torch.int64,
|
525
|
+
device=model_runner.device,
|
526
|
+
)
|
527
|
+
else:
|
528
|
+
mrope_position_deltas = mm_input.mrope_position_delta.flatten().to(
|
529
|
+
model_runner.device, non_blocking=True
|
530
|
+
)
|
531
|
+
mrope_positions_list[batch_idx] = (
|
532
|
+
(mrope_position_deltas + self.seq_lens[batch_idx] - 1)
|
533
|
+
.unsqueeze(0)
|
534
|
+
.repeat(3, 1)
|
535
|
+
)
|
537
536
|
elif self.forward_mode.is_extend():
|
538
537
|
extend_seq_len, extend_prefix_len = (
|
539
538
|
batch.extend_seq_lens[batch_idx],
|
@@ -1655,7 +1655,7 @@ class ModelRunner:
|
|
1655
1655
|
|
1656
1656
|
def apply_torch_tp(self):
|
1657
1657
|
logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
|
1658
|
-
from sglang.srt.model_parallel import tensor_parallel
|
1658
|
+
from sglang.srt.layers.model_parallel import tensor_parallel
|
1659
1659
|
|
1660
1660
|
device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
|
1661
1661
|
tensor_parallel(self.model, device_mesh)
|