sglang 0.5.1__py3-none-any.whl → 0.5.1.post2__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- sglang/srt/disaggregation/decode.py +4 -0
- sglang/srt/disaggregation/prefill.py +4 -0
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/entrypoints/tool.py +7 -7
- sglang/srt/layers/attention/flashinfer_mla_backend.py +71 -89
- sglang/srt/layers/attention/utils.py +15 -94
- sglang/srt/layers/moe/cutlass_moe.py +0 -7
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +6 -2
- sglang/srt/layers/quantization/modelopt_quant.py +2 -2
- sglang/srt/lora/lora_manager.py +29 -12
- sglang/srt/managers/scheduler_metrics_mixin.py +15 -0
- sglang/srt/metrics/collector.py +5 -5
- sglang/srt/model_executor/cuda_graph_runner.py +2 -2
- sglang/srt/models/grok.py +0 -4
- sglang/srt/offloader.py +115 -0
- sglang/srt/server_args.py +0 -4
- sglang/srt/utils.py +0 -7
- sglang/test/test_cutlass_moe.py +33 -28
- sglang/version.py +1 -1
- {sglang-0.5.1.dist-info → sglang-0.5.1.post2.dist-info}/METADATA +4 -4
- {sglang-0.5.1.dist-info → sglang-0.5.1.post2.dist-info}/RECORD +25 -24
- {sglang-0.5.1.dist-info → sglang-0.5.1.post2.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.dist-info → sglang-0.5.1.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.dist-info → sglang-0.5.1.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
ADDED

```diff
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
```
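The new file is a tuned fused-MoE Triton kernel table for NVIDIA B200 (fp8_w8a8, block shape [128, 128]): keys are batch sizes M and values are Triton launch parameters. Below is a minimal sketch of how such a table can be consumed; this is not the sglang loader itself, and the `load_tuned_moe_config` helper and nearest-key policy are assumptions for illustration.

```python
import json

def load_tuned_moe_config(path: str, m: int) -> dict:
    """Pick the tuning entry whose batch-size key is closest to m."""
    with open(path) as f:
        configs = {int(k): v for k, v in json.load(f).items()}
    nearest = min(configs, key=lambda k: abs(k - m))
    return configs[nearest]

# e.g. m=100 would select the "96" entry above (BLOCK_SIZE_M=16, num_stages=3).
```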
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py
CHANGED

```diff
@@ -1,5 +1,7 @@
 import logging
 
+import torch
+
 from sglang.srt.utils import get_bool_env_var, get_device_sm
 
 logger = logging.getLogger(__name__)
@@ -7,8 +9,10 @@ logger = logging.getLogger(__name__)
 
 def _compute_enable_deep_gemm():
     sm_version = get_device_sm()
-
-
+    if sm_version < 90:
+        return False
+    # TODO fix deepgemm cu129 fp8 issue
+    if torch.version.cuda == "12.9":
         return False
 
     try:
```
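The hunk gates DeepGEMM off on pre-SM90 devices and on CUDA 12.9 builds (the fp8 issue is left as a TODO in the code). A hedged, standalone restatement of that gate follows; `deep_gemm_supported` is a hypothetical helper with the SM version passed in rather than read from `get_device_sm()`.

```python
import torch

def deep_gemm_supported(sm_version: int) -> bool:
    if sm_version < 90:  # DeepGEMM requires Hopper (SM90) or newer
        return False
    if torch.version.cuda == "12.9":  # mirrors the diff's TODO: deepgemm fp8 issue on cu129
        return False
    return True
```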
sglang/srt/layers/quantization/modelopt_quant.py
CHANGED

```diff
@@ -876,7 +876,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
                 data=torch.empty(
                     layer.num_local_experts,
                     2 * intermediate_size_per_partition,
-                    # 2 fp4 items are packed in the input dimension
                     hidden_size // self.quant_config.group_size,
                     dtype=weight_scale_dtype,
                 ),
@@ -895,7 +894,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
                 data=torch.empty(
                     layer.num_local_experts,
                     hidden_size,
-                    # 2 fp4 items are packed in the input dimension
                     intermediate_size_per_partition // self.quant_config.group_size,
                     dtype=weight_scale_dtype,
                 ),
@@ -1212,11 +1210,13 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
 
         # Process w13 weights
         w13_blockscale_swizzled = self.swizzle_blockscale(layer.w13_weight_scale)
+        del layer.w13_weight_scale
         layer.w13_blockscale_swizzled.data.copy_(w13_blockscale_swizzled)
         layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
 
         # Process w2 weights
         w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
+        del layer.w2_weight_scale
         layer.w2_blockscale_swizzled.data.copy_(w2_blockscale_swizzled)
         layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
 
```
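The third hunk frees the unswizzled block scales as soon as they have been copied into their swizzled buffers, so the pre-swizzle copy does not linger on the GPU. A toy illustration of that pattern is below; the transpose stands in for `swizzle_blockscale`, and the layer and attribute names are made up.

```python
import torch
from torch import nn
from torch.nn import Parameter

class ToyLayer(nn.Module):
    def __init__(self):
        super().__init__()
        self.w_scale = Parameter(torch.randn(8, 8), requires_grad=False)
        self.w_scale_swizzled = Parameter(torch.empty(8, 8), requires_grad=False)

layer = ToyLayer()
layer.w_scale_swizzled.data.copy_(layer.w_scale.T.contiguous())  # stand-in for the swizzle
del layer.w_scale  # drop the pre-swizzle copy so its memory can be reclaimed
```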
sglang/srt/lora/lora_manager.py
CHANGED

```diff
@@ -420,20 +420,37 @@ class LoRAManager:
     ):
         """Infer LoRA target modules and max_lora_rank from loaded adapters if not provided."""
 
-
-
-
-
-
-
+        self.target_modules = (
+            get_normalized_target_modules(target_modules) if target_modules else set()
+        )
+
+        for lora_id, config in self.configs.items():
+            if not isinstance(config.target_modules, list):
+                raise ValueError(
+                    f"SGLang currently only supports inferring LoRA target modules when a list of "
+                    "suffixes is provided in `target_modules` field of PEFT config. Please explicitly "
+                    "specify `--lora-target-modules` during server startup. You can specify `all` to "
+                    "enable all support modules types. "
+                )
+
+            adapter_target_modules = get_normalized_target_modules(
+                config.target_modules
+            )
+
+            if target_modules is not None:
+                # When `--lora-target-modules` is provided, validate adapter target modules is a subset of the specified target modules.
+                if not adapter_target_modules.issubset(self.target_modules):
+                    unsupported_modules = adapter_target_modules - self.target_modules
+                    lora_name = self.lora_refs[lora_id].lora_name
                     raise ValueError(
-                        f"
-                        "
-                        "
-                        "enable all
+                        f"LoRA adapter '{lora_name}' contains target modules {sorted(unsupported_modules)} "
+                        f"that are not included in the specified --lora-target-modules {sorted(self.target_modules)}. "
+                        f"Please update --lora-target-modules to include all required modules: "
+                        f"{sorted(self.target_modules | adapter_target_modules)}, or use 'all' to enable all supported modules."
                     )
-
-
+            else:
+                # Otherwise, infer target_modules from adapter configs.
+                self.target_modules.update(adapter_target_modules)
 
         if max_lora_rank is not None:
             self.max_lora_rank = max_lora_rank
```
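The rewritten block normalizes each adapter's `target_modules`, then either validates them against an explicit `--lora-target-modules` set or unions them into the inferred set. Below is a condensed, hypothetical helper that captures just the validate-or-merge step; normalization via `get_normalized_target_modules` is assumed to have happened already.

```python
def validate_and_merge_target_modules(
    server_modules: set, adapter_modules: set, lora_name: str, explicit: bool
) -> set:
    """Reject adapters not covered by an explicit --lora-target-modules, else merge."""
    if explicit:
        if not adapter_modules.issubset(server_modules):
            missing = sorted(adapter_modules - server_modules)
            needed = sorted(server_modules | adapter_modules)
            raise ValueError(
                f"LoRA adapter '{lora_name}' contains target modules {missing} not covered "
                f"by --lora-target-modules; extend the flag to {needed} or pass 'all'."
            )
        return server_modules
    return server_modules | adapter_modules
```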
sglang/srt/managers/scheduler_metrics_mixin.py
CHANGED

```diff
@@ -125,6 +125,14 @@ class SchedulerMetricsMixin:
             total_queue_latency += req.queue_time_end - req.queue_time_start
             self.stats.avg_request_queue_latency = total_queue_latency / num_new_seq
 
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            self.stats.num_prefill_prealloc_queue_reqs = len(
+                self.disagg_prefill_bootstrap_queue.queue
+            )
+            self.stats.num_prefill_inflight_queue_reqs = len(
+                self.disagg_prefill_inflight_queue
+            )
+
         self.metrics_collector.log_stats(self.stats)
         self._emit_kv_metrics()
         self._publish_kv_events()
@@ -202,6 +210,13 @@ class SchedulerMetricsMixin:
             self.stats.spec_accept_length = spec_accept_length
         self.stats.total_retracted_reqs = self.total_retracted_reqs
         self.metrics_collector.log_stats(self.stats)
+        if self.disaggregation_mode == DisaggregationMode.DECODE:
+            self.stats.num_decode_prealloc_queue_reqs = len(
+                self.disagg_decode_prealloc_queue.queue
+            )
+            self.stats.num_decode_transfer_queue_reqs = len(
+                self.disagg_decode_transfer_queue.queue
+            )
         self._emit_kv_metrics()
         self._publish_kv_events()
 
```
sglang/srt/metrics/collector.py
CHANGED

```diff
@@ -142,7 +142,7 @@ class SchedulerStats:
     spec_accept_length: float = 0.0
     avg_request_queue_latency: float = 0.0
     num_prefill_prealloc_queue_reqs: int = 0
-
+    num_prefill_inflight_queue_reqs: int = 0
     num_decode_prealloc_queue_reqs: int = 0
     num_decode_transfer_queue_reqs: int = 0
     total_retracted_reqs: int = 0
@@ -235,9 +235,9 @@ class SchedulerMetricsCollector:
             multiprocess_mode="mostrecent",
         )
 
-        self.
-            name="sglang:
-            documentation="The number of requests in the prefill
+        self.num_prefill_inflight_queue_reqs = Gauge(
+            name="sglang:num_prefill_inflight_queue_reqs",
+            documentation="The number of requests in the prefill inflight queue.",
             labelnames=labels.keys(),
             multiprocess_mode="mostrecent",
         )
@@ -294,7 +294,7 @@ class SchedulerMetricsCollector:
             self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
         )
         self._log_gauge(
-            self.
+            self.num_prefill_inflight_queue_reqs, stats.num_prefill_inflight_queue_reqs
         )
         self._log_gauge(
             self.num_decode_prealloc_queue_reqs, stats.num_decode_prealloc_queue_reqs
```
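The gauge tracking the prefill in-flight queue is renamed to `num_prefill_inflight_queue_reqs` in both the `SchedulerStats` dataclass and the collector. A hedged sketch of how such a gauge is declared with `prometheus_client` is shown below; the `model_name` label is an assumption for illustration, since in sglang the label names come from the collector's `labels` dict.

```python
from prometheus_client import Gauge

num_prefill_inflight_queue_reqs = Gauge(
    name="sglang:num_prefill_inflight_queue_reqs",
    documentation="The number of requests in the prefill inflight queue.",
    labelnames=["model_name"],
    multiprocess_mode="mostrecent",
)
num_prefill_inflight_queue_reqs.labels(model_name="demo-model").set(3)
```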
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED

```diff
@@ -54,7 +54,7 @@ from sglang.srt.utils import (
     empty_context,
     get_available_gpu_memory,
     get_device_memory_capacity,
-
+    log_info_on_rank0,
     require_attn_tp_gather,
     require_gathered_buffer,
     require_mlp_sync,
@@ -267,7 +267,7 @@ class CudaGraphRunner:
 
         # Batch sizes to capture
         self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
-
+        log_info_on_rank0(logger, f"Capture cuda graph bs {self.capture_bs}")
         self.capture_forward_mode = ForwardMode.DECODE
         self.capture_hidden_mode = CaptureHiddenMode.NULL
         self.num_tokens_per_bs = 1
```
sglang/srt/models/grok.py
CHANGED

```diff
@@ -842,10 +842,6 @@ class Grok1ForCausalLM(nn.Module):
         if self.is_weights_presharded:
             setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights)
 
-        default_replicate_lm_head = False
-        self.replicate_lm_head = getattr(
-            config, "replicate_lm_head", default_replicate_lm_head
-        )
         self.replicate_embedding = getattr(config, "replicate_embedding", False)
 
         self.model = Grok1Model(
```
sglang/srt/offloader.py
CHANGED

```diff
@@ -321,6 +321,7 @@ class _BaseParamOffloader(ABC):
     @staticmethod
     def create(mode: str, **kwargs) -> "_BaseParamOffloader":
         return {
+            "meta": _MetaParamOffloader,
             "cpu": _CpuParamOffloader,
             "shm_cpu": _ShmCpuParamOffloader,
             "sharded_gpu": _ShardedGpuParamOffloader,
@@ -341,6 +342,17 @@ class _BaseParamOffloader(ABC):
         raise NotImplementedError
 
 
+class _MetaParamOffloader(_BaseParamOffloader):
+    """Usually used for debugging."""
+
+    def __init__(self, module, param_name):
+        super().__init__(module, param_name)
+        _move_param_to_meta(module, param_name)
+
+    def create_device_tensor(self):
+        return torch.empty_like(self._param.data, device="cuda")
+
+
 class _CpuParamOffloader(_BaseParamOffloader):
     def __init__(self, module, param_name):
         super().__init__(module, param_name)
@@ -431,3 +443,106 @@ def _empty_strided_like(x: torch.Tensor, device, pin_memory=False):
         device=device,
         pin_memory=pin_memory,
     )
+
+
+# ----------------------------------------- ShardedGpu ------------------------------------------------------
+
+
+# TODO unify with ShmCpu mode
+class _ShardedGpuParamOffloader(_BaseParamOffloader):
+    def __init__(self, module, param_name):
+        super().__init__(module, param_name)
+        self._rank = get_naive_distributed().get_rank()
+        self._world_size = get_naive_distributed().get_world_size()
+
+        from sglang.srt.distributed import get_tensor_model_parallel_world_size
+
+        assert get_tensor_model_parallel_world_size() == 1, "not yet support tp_size!=1"
+        assert (
+            self._param.data.is_contiguous()
+        ), f"not yet support non-contiguous tensor {self._param.shape=} {self._param.stride()=}"
+
+        if self._rank == 0:
+            _move_param_to_cpu(self._param, pin_memory=True)
+        else:
+            _move_param_to_meta(self._module, self._param_name)
+
+        self.sharded_param_handles = None
+
+    def post_init(self):
+        # check again since it may be changed
+        assert (
+            self._param.data.is_contiguous()
+        ), f"not yet support non-contiguous tensor {self._param.shape=} {self._param.stride()=}"
+
+        scatter_src = self._param.data
+
+        logger.info(
+            f"[offloader] post_init {scatter_src.nbytes=} {scatter_src.dtype=} {scatter_src.shape=} {torch.cuda.memory_allocated()=}"
+        )
+
+        if self._rank == 0:
+            scatter_src = scatter_src.to("cuda")
+        scatter_list = _even_chunk(scatter_src, self._world_size)
+
+        sharded_param = torch.empty(
+            scatter_list[0].shape, dtype=scatter_list[0].dtype, device="cuda"
+        )
+        self.sharded_param_handles = _create_shared_buffer_tensors(
+            local_tensor=sharded_param
+        )
+
+        get_naive_distributed().scatter(
+            sharded_param, scatter_list if self._rank == 0 else None
+        )
+
+        _move_param_to_meta(self._module, self._param_name)
+
+    def create_device_tensor(self):
+        output = _empty_strided_like(self._param, device="cuda")
+        output_chunks = output.chunk(self._world_size)
+
+        for index in range(self._world_size):
+            src_rank = (self._rank + index) % self._world_size
+            src_buf = self.sharded_param_handles[src_rank]
+            output_chunks[src_rank].copy_(src_buf)
+
+        return output
+
+
+def _even_chunk(x: torch.Tensor, chunks: int):
+    assert x.shape[0] % chunks == 0, f"{x.shape=} {chunks=}"
+    return list(x.chunk(chunks))
+
+
+def _create_shared_buffer_tensors(local_tensor: torch.Tensor) -> List[torch.Tensor]:
+    self_rank = get_naive_distributed().get_rank()
+    world_size = get_naive_distributed().get_world_size()
+
+    object_list = get_naive_distributed().all_gather_object(
+        dict(
+            dup_serialized_local_tensor=[
+                (
+                    None
+                    if interesting_rank == self_rank
+                    else MultiprocessingSerializer.serialize(local_tensor)
+                )
+                for interesting_rank in range(world_size)
+            ]
+        )
+    )
+
+    output_tensors = []
+    for output_rank in range(world_size):
+        remote_serialized_tensor = object_list[output_rank][
+            "dup_serialized_local_tensor"
+        ][self_rank]
+        if output_rank == self_rank:
+            assert remote_serialized_tensor is None
+            output_tensors.append(local_tensor)
+        else:
+            output_tensors.append(
+                MultiprocessingSerializer.deserialize(remote_serialized_tensor)
+            )
+
+    return output_tensors
```
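`_ShardedGpuParamOffloader.create_device_tensor` rebuilds the full parameter by copying each rank's shard into the matching chunk of a freshly allocated tensor. Below is a self-contained, single-process sketch of that reassembly step (no distributed setup; shapes and names are illustrative only).

```python
import torch

def reassemble(chunks: list) -> torch.Tensor:
    """Rebuild a tensor from equally sized row chunks, one per (pretend) rank."""
    full = torch.empty(
        sum(c.shape[0] for c in chunks), *chunks[0].shape[1:],
        dtype=chunks[0].dtype, device=chunks[0].device,
    )
    for rank, dst in enumerate(full.chunk(len(chunks))):
        dst.copy_(chunks[rank])  # copy each shard into its slot of the full tensor
    return full

parts = list(torch.arange(12.0).chunk(4))  # pretend four ranks each hold one shard
assert torch.equal(reassemble(parts), torch.arange(12.0))
```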
sglang/srt/server_args.py
CHANGED

```diff
@@ -639,10 +639,6 @@ class ServerArgs:
             logger.warning(
                 "DeepSeek MTP does not require setting speculative_draft_model_path."
             )
-        if self.page_size != 1 and self.attention_backend == "flashinfer":
-            raise ValueError(
-                "Speculative decoding with page_size != 1 is not supported. Please set page_size to 1."
-            )
 
         # Auto choose parameters
         if self.speculative_num_steps is None:
```
sglang/srt/utils.py
CHANGED

```diff
@@ -2002,13 +2002,6 @@ def configure_ipv6(dist_init_addr):
     return port, host
 
 
-def rank0_log(msg: str):
-    from sglang.srt.distributed import get_tensor_model_parallel_rank
-
-    if get_tensor_model_parallel_rank() == 0:
-        logger.info(msg)
-
-
 def launch_dummy_health_check_server(host, port, enable_metrics):
     import asyncio
 
```
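The removed `rank0_log` helper is superseded by `log_info_on_rank0(logger, msg)`, which the cuda_graph_runner.py hunk earlier in this diff imports from sglang.srt.utils. A minimal stand-in is sketched below; the real implementation may differ in detail.

```python
import logging

def log_info_on_rank0(logger: logging.Logger, msg: str) -> None:
    # Assumed behaviour: only tensor-parallel rank 0 emits the message.
    from sglang.srt.distributed import get_tensor_model_parallel_rank

    if get_tensor_model_parallel_rank() == 0:
        logger.info(msg)
```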
sglang/test/test_cutlass_moe.py
CHANGED

```diff
@@ -8,6 +8,15 @@ from transformers import AutoConfig
 
 from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+from sglang.srt.layers.moe.moe_runner.base import MoeRunnerConfig
+
+
+# Copy from: https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/utils.py
+def calc_diff(x, y):
+    x, y = x.double(), y.double()
+    denominator = (x * x + y * y).sum()
+    sim = 2 * (x * y).sum() / denominator
+    return 1 - sim
 
 
 def get_model_config(tp_size: int):
@@ -69,16 +78,11 @@ def run_test(tp_size, batch_size, model_config, check=False):
 
     # --- Input Data ---
     # Use bf16/fp16 for input activation based on model config
-    x = torch.randn((batch_size, H), device="cuda", dtype=dtype)
+    x = torch.randn((batch_size, H), device="cuda", dtype=dtype)
     # --- Weights (Generate in higher precision, then convert to FP8) ---
    # Generate weights suitable for FP8 conversion (e.g., scaled appropriately)
-    w1_hp = (
-
-    )
-    w2_hp = (
-        torch.randn((E, H, I // 2), device="cuda", dtype=torch.float32) * 0.00001
-        + 0.00001
-    )
+    w1_hp = torch.randn((E, I, H), device="cuda", dtype=torch.float32)
+    w2_hp = torch.randn((E, H, I // 2), device="cuda", dtype=torch.float32)
 
     w1 = to_fp8(w1_hp)
     w2 = to_fp8(w2_hp)
@@ -149,13 +153,13 @@ def run_test(tp_size, batch_size, model_config, check=False):
     )
 
     # Note: Triton expects non-transposed weights
+    moe_config = MoeRunnerConfig(inplace=False)
     triton_lambda = lambda: fused_experts(
         x,
         w1,
         w2,
         (topk_weights, topk_ids, "dummy"),
-
-        activation="silu",  # Assuming SiLU activation common in MoEs
+        moe_config,
         use_fp8_w8a8=True,
         w1_scale=w1_scale,
         w2_scale=w2_scale,
@@ -221,32 +225,19 @@ def run_test(tp_size, batch_size, model_config, check=False):
         w1,  # Original shape
         w2,  # Original shape
         (topk_weights, topk_ids, "dummy"),
-
-        activation="silu",
+        moe_config,
         use_fp8_w8a8=True,
         w1_scale=w1_scale,
         w2_scale=w2_scale,
         block_shape=block_shape,
     )
 
-
-
-    y_triton = y_triton.to(dtype)
-
-    abs_error = torch.abs(y_cutlass - y_triton)
-    rel_error = abs_error / torch.clamp(torch.abs(y_triton), min=1e-2)
-
-    max_abs_err = abs_error.max().item()
-    max_rel_err = rel_error.max().item()
-
-    print("y_cutlass:", y_cutlass[:, :10])
-    print("y_triton:", y_triton[:, :10])
-    print(f"Max absolute error: {max_abs_err:.6f}")
-    print(f"Max relative error: {max_rel_err:.6f}")
+    diff = calc_diff(y_cutlass, y_triton)
+    print(f"Diff: {diff:.6f}")
 
     # Tolerance might need adjustment based on FP8 specifics and kernel differences
     # FP8 comparisons often require higher tolerance than FP16/BF16
-    assert
+    assert diff < 1e-4, f"Diff too high! {diff}"
     print("Correctness check passed.")
 
 
@@ -264,7 +255,21 @@ if __name__ == "__main__":
         "--batch-sizes",
         type=int,
         nargs="+",
-        default=[
+        default=[
+            1,
+            4,
+            8,
+            16,
+            32,
+            64,
+            128,
+            256,
+            512,
+            1024,
+            2048,
+            4096,
+            8192,
+        ],  # Adjusted default
         help="List of batch sizes to test",
     )
     parser.add_argument("--check", action="store_true", help="Enable check mode")
```
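The test now scores agreement with `calc_diff`, which returns 1 - 2*sum(x*y) / (sum(x^2) + sum(y^2)): 0.0 for identical tensors, growing as they diverge, and compared against a 1e-4 threshold. A quick CPU-only illustration of the metric's scale:

```python
import torch

def calc_diff(x, y):  # same definition as added to the test above
    x, y = x.double(), y.double()
    denominator = (x * x + y * y).sum()
    sim = 2 * (x * y).sum() / denominator
    return 1 - sim

a = torch.randn(64, 64)
print(float(calc_diff(a, a)))          # 0.0 for identical tensors
print(float(calc_diff(a, a * 1.001)))  # roughly 5e-7; a 0.1% scale error stays well under 1e-4
```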
sglang/version.py
CHANGED

```diff
@@ -1 +1 @@
-__version__ = "0.5.1"
+__version__ = "0.5.1.post2"
```
{sglang-0.5.1.dist-info → sglang-0.5.1.post2.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.1
+Version: 0.5.1.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -262,7 +262,7 @@ Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: flashinfer_python==0.2.
+Requires-Dist: flashinfer_python==0.2.14.post1; extra == "srt"
 Provides-Extra: blackwell
 Requires-Dist: sglang[runtime_common]; extra == "blackwell"
 Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -270,7 +270,7 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
 Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
 Requires-Dist: torchvision; extra == "blackwell"
 Requires-Dist: cuda-python; extra == "blackwell"
-Requires-Dist: flashinfer_python==0.2.
+Requires-Dist: flashinfer_python==0.2.14.post1; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -374,7 +374,7 @@ Dynamic: license-file
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
-- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking. [
+- [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf)).
 - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
 - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
 - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).
```