sglang 0.5.1__py3-none-any.whl → 0.5.1.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,146 @@
+ {
+   "1": {
+     "BLOCK_SIZE_M": 16,
+     "BLOCK_SIZE_N": 64,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 32,
+     "num_warps": 4,
+     "num_stages": 4
+   },
+   "2": {
+     "BLOCK_SIZE_M": 16,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 1,
+     "num_warps": 4,
+     "num_stages": 4
+   },
+   "4": {
+     "BLOCK_SIZE_M": 16,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 1,
+     "num_warps": 4,
+     "num_stages": 4
+   },
+   "8": {
+     "BLOCK_SIZE_M": 16,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 1,
+     "num_warps": 4,
+     "num_stages": 4
+   },
+   "16": {
+     "BLOCK_SIZE_M": 16,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 1,
+     "num_warps": 4,
+     "num_stages": 3
+   },
+   "24": {
+     "BLOCK_SIZE_M": 16,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 1,
+     "num_warps": 4,
+     "num_stages": 3
+   },
+   "32": {
+     "BLOCK_SIZE_M": 16,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 64,
+     "GROUP_SIZE_M": 1,
+     "num_warps": 4,
+     "num_stages": 3
+   },
+   "48": {
+     "BLOCK_SIZE_M": 16,
+     "BLOCK_SIZE_N": 256,
+     "BLOCK_SIZE_K": 64,
+     "GROUP_SIZE_M": 1,
+     "num_warps": 4,
+     "num_stages": 3
+   },
+   "64": {
+     "BLOCK_SIZE_M": 16,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 1,
+     "num_warps": 4,
+     "num_stages": 4
+   },
+   "96": {
+     "BLOCK_SIZE_M": 16,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 1,
+     "num_warps": 4,
+     "num_stages": 3
+   },
+   "128": {
+     "BLOCK_SIZE_M": 16,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 1,
+     "num_warps": 4,
+     "num_stages": 3
+   },
+   "256": {
+     "BLOCK_SIZE_M": 16,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 1,
+     "num_warps": 4,
+     "num_stages": 3
+   },
+   "512": {
+     "BLOCK_SIZE_M": 16,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 1,
+     "num_warps": 4,
+     "num_stages": 3
+   },
+   "1024": {
+     "BLOCK_SIZE_M": 32,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 1,
+     "num_warps": 4,
+     "num_stages": 3
+   },
+   "1536": {
+     "BLOCK_SIZE_M": 64,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 32,
+     "num_warps": 4,
+     "num_stages": 4
+   },
+   "2048": {
+     "BLOCK_SIZE_M": 64,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 16,
+     "num_warps": 4,
+     "num_stages": 4
+   },
+   "3072": {
+     "BLOCK_SIZE_M": 64,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 1,
+     "num_warps": 4,
+     "num_stages": 4
+   },
+   "4096": {
+     "BLOCK_SIZE_M": 64,
+     "BLOCK_SIZE_N": 128,
+     "BLOCK_SIZE_K": 128,
+     "GROUP_SIZE_M": 64,
+     "num_warps": 4,
+     "num_stages": 3
+   }
+ }
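The new file above follows the schema of sglang's batch-size-keyed Triton tuning tables for fused MoE kernels: each top-level key is a token count M, and each value is the Triton launch configuration (tile sizes, group size, warps, pipeline stages) tuned for that M. As a rough illustration of how such a table can be consumed, here is a minimal sketch; the helper names and the nearest-key selection rule are assumptions for illustration, not sglang's exact lookup code:

```python
import json


def load_moe_config(path: str) -> dict[int, dict]:
    # JSON keys are strings ("1", "2", ...); convert them to ints for lookup.
    with open(path) as f:
        return {int(m): cfg for m, cfg in json.load(f).items()}


def pick_config(configs: dict[int, dict], m: int) -> dict:
    # Pick the tuned entry whose batch size is closest to the runtime M.
    # (Illustrative rule only; sglang's fused_moe_triton code owns the real one.)
    return configs[min(configs, key=lambda k: abs(k - m))]


# Example: with the table above, a batch of 44 tokens maps to the "48" entry,
# i.e. BLOCK_SIZE_N=256 and BLOCK_SIZE_K=64.
```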
@@ -1,5 +1,7 @@
  import logging
 
+ import torch
+
  from sglang.srt.utils import get_bool_env_var, get_device_sm
 
  logger = logging.getLogger(__name__)
@@ -7,8 +9,10 @@ logger = logging.getLogger(__name__)
 
  def _compute_enable_deep_gemm():
      sm_version = get_device_sm()
-     # TODO fix blackwell fp8
-     if sm_version != 90:
+     if sm_version < 90:
+         return False
+     # TODO fix deepgemm cu129 fp8 issue
+     if torch.version.cuda == "12.9":
          return False
 
      try:
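Net effect of these hunks: DeepGEMM is no longer gated to exactly SM90; it is allowed on SM90 and newer but explicitly disabled on CUDA 12.9 builds until the noted fp8 issue is fixed. A condensed sketch of just the two early-exit guards (the `try:` tail of the real function is outside this hunk and omitted here):

```python
import torch


def _deep_gemm_guards(sm_version: int) -> bool:
    """Sketch of the early-exit checks above, not the full _compute_enable_deep_gemm."""
    if sm_version < 90:  # pre-Hopper GPUs never enable DeepGEMM
        return False
    if torch.version.cuda == "12.9":  # TODO in the patch: deepgemm cu129 fp8 issue
        return False
    return True  # the real function continues with the `try:` block shown above
```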
@@ -876,7 +876,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
      data=torch.empty(
          layer.num_local_experts,
          2 * intermediate_size_per_partition,
-         # 2 fp4 items are packed in the input dimension
          hidden_size // self.quant_config.group_size,
          dtype=weight_scale_dtype,
      ),
@@ -895,7 +894,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
      data=torch.empty(
          layer.num_local_experts,
          hidden_size,
-         # 2 fp4 items are packed in the input dimension
          intermediate_size_per_partition // self.quant_config.group_size,
          dtype=weight_scale_dtype,
      ),
@@ -1212,11 +1210,13 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
 
          # Process w13 weights
          w13_blockscale_swizzled = self.swizzle_blockscale(layer.w13_weight_scale)
+         del layer.w13_weight_scale
          layer.w13_blockscale_swizzled.data.copy_(w13_blockscale_swizzled)
          layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
 
          # Process w2 weights
          w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
+         del layer.w2_weight_scale
          layer.w2_blockscale_swizzled.data.copy_(w2_blockscale_swizzled)
          layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
 
@@ -420,20 +420,37 @@ class LoRAManager:
      ):
          """Infer LoRA target modules and max_lora_rank from loaded adapters if not provided."""
 
-         if target_modules is not None:
-             self.target_modules = set(target_modules)
-         else:
-             self.target_modules = set()
-             for config in self.configs.values():
-                 if not isinstance(config.target_modules, list):
+         self.target_modules = (
+             get_normalized_target_modules(target_modules) if target_modules else set()
+         )
+
+         for lora_id, config in self.configs.items():
+             if not isinstance(config.target_modules, list):
+                 raise ValueError(
+                     f"SGLang currently only supports inferring LoRA target modules when a list of "
+                     "suffixes is provided in `target_modules` field of PEFT config. Please explicitly "
+                     "specify `--lora-target-modules` during server startup. You can specify `all` to "
+                     "enable all support modules types. "
+                 )
+
+             adapter_target_modules = get_normalized_target_modules(
+                 config.target_modules
+             )
+
+             if target_modules is not None:
+                 # When `--lora-target-modules` is provided, validate adapter target modules is a subset of the specified target modules.
+                 if not adapter_target_modules.issubset(self.target_modules):
+                     unsupported_modules = adapter_target_modules - self.target_modules
+                     lora_name = self.lora_refs[lora_id].lora_name
                      raise ValueError(
-                         f"SGLang currently only supports inferring LoRA target modules when a list of "
-                         "suffixes is provided in `target_modules` field of PEFT config. Please explicitly "
-                         "specify `--lora-target-modules` during server startup. You can specify `all` to "
-                         "enable all support modules types. "
+                         f"LoRA adapter '{lora_name}' contains target modules {sorted(unsupported_modules)} "
+                         f"that are not included in the specified --lora-target-modules {sorted(self.target_modules)}. "
+                         f"Please update --lora-target-modules to include all required modules: "
+                         f"{sorted(self.target_modules | adapter_target_modules)}, or use 'all' to enable all supported modules."
                      )
-                 self.target_modules.update(config.target_modules)
-             self.target_modules = get_normalized_target_modules(self.target_modules)
+             else:
+                 # Otherwise, infer target_modules from adapter configs.
+                 self.target_modules.update(adapter_target_modules)
 
          if max_lora_rank is not None:
              self.max_lora_rank = max_lora_rank
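Put in plain set terms, the new check rejects any adapter whose normalized target modules are not a subset of the user-specified `--lora-target-modules`. A tiny illustration with made-up module names, independent of sglang's API:

```python
# Hypothetical example: server started with --lora-target-modules q_proj k_proj
specified = {"q_proj", "k_proj"}
adapter = {"q_proj", "v_proj"}  # normalized target modules of a loaded adapter

unsupported = adapter - specified              # {"v_proj"}
needs_error = not adapter.issubset(specified)  # True -> LoRAManager raises ValueError
print(sorted(specified | adapter))             # the fix list suggested in the error message
```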
@@ -125,6 +125,14 @@ class SchedulerMetricsMixin:
              total_queue_latency += req.queue_time_end - req.queue_time_start
          self.stats.avg_request_queue_latency = total_queue_latency / num_new_seq
 
+         if self.disaggregation_mode == DisaggregationMode.PREFILL:
+             self.stats.num_prefill_prealloc_queue_reqs = len(
+                 self.disagg_prefill_bootstrap_queue.queue
+             )
+             self.stats.num_prefill_inflight_queue_reqs = len(
+                 self.disagg_prefill_inflight_queue
+             )
+
          self.metrics_collector.log_stats(self.stats)
          self._emit_kv_metrics()
          self._publish_kv_events()
@@ -202,6 +210,13 @@ class SchedulerMetricsMixin:
          self.stats.spec_accept_length = spec_accept_length
          self.stats.total_retracted_reqs = self.total_retracted_reqs
          self.metrics_collector.log_stats(self.stats)
+         if self.disaggregation_mode == DisaggregationMode.DECODE:
+             self.stats.num_decode_prealloc_queue_reqs = len(
+                 self.disagg_decode_prealloc_queue.queue
+             )
+             self.stats.num_decode_transfer_queue_reqs = len(
+                 self.disagg_decode_transfer_queue.queue
+             )
          self._emit_kv_metrics()
          self._publish_kv_events()
 
@@ -142,7 +142,7 @@ class SchedulerStats:
      spec_accept_length: float = 0.0
      avg_request_queue_latency: float = 0.0
      num_prefill_prealloc_queue_reqs: int = 0
-     num_prefill_infight_queue_reqs: int = 0
+     num_prefill_inflight_queue_reqs: int = 0
      num_decode_prealloc_queue_reqs: int = 0
      num_decode_transfer_queue_reqs: int = 0
      total_retracted_reqs: int = 0
@@ -235,9 +235,9 @@ class SchedulerMetricsCollector:
              multiprocess_mode="mostrecent",
          )
 
-         self.num_prefill_infight_queue_reqs = Gauge(
-             name="sglang:num_prefill_infight_queue_reqs",
-             documentation="The number of requests in the prefill infight queue.",
+         self.num_prefill_inflight_queue_reqs = Gauge(
+             name="sglang:num_prefill_inflight_queue_reqs",
+             documentation="The number of requests in the prefill inflight queue.",
              labelnames=labels.keys(),
              multiprocess_mode="mostrecent",
          )
@@ -294,7 +294,7 @@ class SchedulerMetricsCollector:
              self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
          )
          self._log_gauge(
-             self.num_prefill_infight_queue_reqs, stats.num_prefill_infight_queue_reqs
+             self.num_prefill_inflight_queue_reqs, stats.num_prefill_inflight_queue_reqs
          )
          self._log_gauge(
              self.num_decode_prealloc_queue_reqs, stats.num_decode_prealloc_queue_reqs
@@ -54,7 +54,7 @@ from sglang.srt.utils import (
      empty_context,
      get_available_gpu_memory,
      get_device_memory_capacity,
-     rank0_log,
+     log_info_on_rank0,
      require_attn_tp_gather,
      require_gathered_buffer,
      require_mlp_sync,
@@ -267,7 +267,7 @@ class CudaGraphRunner:
 
          # Batch sizes to capture
          self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
-         rank0_log(f"Capture cuda graph bs {self.capture_bs}")
+         log_info_on_rank0(logger, f"Capture cuda graph bs {self.capture_bs}")
          self.capture_forward_mode = ForwardMode.DECODE
          self.capture_hidden_mode = CaptureHiddenMode.NULL
          self.num_tokens_per_bs = 1
sglang/srt/models/grok.py CHANGED
@@ -842,10 +842,6 @@ class Grok1ForCausalLM(nn.Module):
          if self.is_weights_presharded:
              setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights)
 
-         default_replicate_lm_head = False
-         self.replicate_lm_head = getattr(
-             config, "replicate_lm_head", default_replicate_lm_head
-         )
          self.replicate_embedding = getattr(config, "replicate_embedding", False)
 
          self.model = Grok1Model(
sglang/srt/offloader.py CHANGED
@@ -321,6 +321,7 @@ class _BaseParamOffloader(ABC):
      @staticmethod
      def create(mode: str, **kwargs) -> "_BaseParamOffloader":
          return {
+             "meta": _MetaParamOffloader,
              "cpu": _CpuParamOffloader,
              "shm_cpu": _ShmCpuParamOffloader,
              "sharded_gpu": _ShardedGpuParamOffloader,
@@ -341,6 +342,17 @@ class _BaseParamOffloader(ABC):
          raise NotImplementedError
 
 
+ class _MetaParamOffloader(_BaseParamOffloader):
+     """Usually used for debugging."""
+
+     def __init__(self, module, param_name):
+         super().__init__(module, param_name)
+         _move_param_to_meta(module, param_name)
+
+     def create_device_tensor(self):
+         return torch.empty_like(self._param.data, device="cuda")
+
+
  class _CpuParamOffloader(_BaseParamOffloader):
      def __init__(self, module, param_name):
          super().__init__(module, param_name)
@@ -431,3 +443,106 @@ def _empty_strided_like(x: torch.Tensor, device, pin_memory=False):
          device=device,
          pin_memory=pin_memory,
      )
+
+
+ # ----------------------------------------- ShardedGpu ------------------------------------------------------
+
+
+ # TODO unify with ShmCpu mode
+ class _ShardedGpuParamOffloader(_BaseParamOffloader):
+     def __init__(self, module, param_name):
+         super().__init__(module, param_name)
+         self._rank = get_naive_distributed().get_rank()
+         self._world_size = get_naive_distributed().get_world_size()
+
+         from sglang.srt.distributed import get_tensor_model_parallel_world_size
+
+         assert get_tensor_model_parallel_world_size() == 1, "not yet support tp_size!=1"
+         assert (
+             self._param.data.is_contiguous()
+         ), f"not yet support non-contiguous tensor {self._param.shape=} {self._param.stride()=}"
+
+         if self._rank == 0:
+             _move_param_to_cpu(self._param, pin_memory=True)
+         else:
+             _move_param_to_meta(self._module, self._param_name)
+
+         self.sharded_param_handles = None
+
+     def post_init(self):
+         # check again since it may be changed
+         assert (
+             self._param.data.is_contiguous()
+         ), f"not yet support non-contiguous tensor {self._param.shape=} {self._param.stride()=}"
+
+         scatter_src = self._param.data
+
+         logger.info(
+             f"[offloader] post_init {scatter_src.nbytes=} {scatter_src.dtype=} {scatter_src.shape=} {torch.cuda.memory_allocated()=}"
+         )
+
+         if self._rank == 0:
+             scatter_src = scatter_src.to("cuda")
+         scatter_list = _even_chunk(scatter_src, self._world_size)
+
+         sharded_param = torch.empty(
+             scatter_list[0].shape, dtype=scatter_list[0].dtype, device="cuda"
+         )
+         self.sharded_param_handles = _create_shared_buffer_tensors(
+             local_tensor=sharded_param
+         )
+
+         get_naive_distributed().scatter(
+             sharded_param, scatter_list if self._rank == 0 else None
+         )
+
+         _move_param_to_meta(self._module, self._param_name)
+
+     def create_device_tensor(self):
+         output = _empty_strided_like(self._param, device="cuda")
+         output_chunks = output.chunk(self._world_size)
+
+         for index in range(self._world_size):
+             src_rank = (self._rank + index) % self._world_size
+             src_buf = self.sharded_param_handles[src_rank]
+             output_chunks[src_rank].copy_(src_buf)
+
+         return output
+
+
+ def _even_chunk(x: torch.Tensor, chunks: int):
+     assert x.shape[0] % chunks == 0, f"{x.shape=} {chunks=}"
+     return list(x.chunk(chunks))
+
+
+ def _create_shared_buffer_tensors(local_tensor: torch.Tensor) -> List[torch.Tensor]:
+     self_rank = get_naive_distributed().get_rank()
+     world_size = get_naive_distributed().get_world_size()
+
+     object_list = get_naive_distributed().all_gather_object(
+         dict(
+             dup_serialized_local_tensor=[
+                 (
+                     None
+                     if interesting_rank == self_rank
+                     else MultiprocessingSerializer.serialize(local_tensor)
+                 )
+                 for interesting_rank in range(world_size)
+             ]
+         )
+     )
+
+     output_tensors = []
+     for output_rank in range(world_size):
+         remote_serialized_tensor = object_list[output_rank][
+             "dup_serialized_local_tensor"
+         ][self_rank]
+         if output_rank == self_rank:
+             assert remote_serialized_tensor is None
+             output_tensors.append(local_tensor)
+         else:
+             output_tensors.append(
+                 MultiprocessingSerializer.deserialize(remote_serialized_tensor)
+             )
+
+     return output_tensors
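Below is a sketch of how the new `"meta"` offload mode might be exercised through `_BaseParamOffloader.create`. The keyword arguments `module` and `param_name` are inferred from the constructors in this diff, and the call site is hypothetical; real usage is wired up inside sglang's offloader itself.

```python
import torch
import torch.nn as nn

from sglang.srt.offloader import _BaseParamOffloader  # assumed import path

linear = nn.Linear(4096, 4096, bias=False)

# Moves linear.weight to the meta device at construction time.
offloader = _BaseParamOffloader.create("meta", module=linear, param_name="weight")

# Later, materialize an (uninitialized) CUDA tensor with the same shape and dtype.
if torch.cuda.is_available():
    device_weight = offloader.create_device_tensor()
    print(device_weight.shape, device_weight.device)
```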
sglang/srt/server_args.py CHANGED
@@ -639,10 +639,6 @@ class ServerArgs:
              logger.warning(
                  "DeepSeek MTP does not require setting speculative_draft_model_path."
              )
-             if self.page_size != 1 and self.attention_backend == "flashinfer":
-                 raise ValueError(
-                     "Speculative decoding with page_size != 1 is not supported. Please set page_size to 1."
-                 )
 
          # Auto choose parameters
          if self.speculative_num_steps is None:
sglang/srt/utils.py CHANGED
@@ -2002,13 +2002,6 @@ def configure_ipv6(dist_init_addr):
      return port, host
 
 
- def rank0_log(msg: str):
-     from sglang.srt.distributed import get_tensor_model_parallel_rank
-
-     if get_tensor_model_parallel_rank() == 0:
-         logger.info(msg)
-
-
  def launch_dummy_health_check_server(host, port, enable_metrics):
      import asyncio
 
@@ -8,6 +8,15 @@ from transformers import AutoConfig
 
  from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
  from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+ from sglang.srt.layers.moe.moe_runner.base import MoeRunnerConfig
+
+
+ # Copy from: https://github.com/deepseek-ai/DeepGEMM/blob/main/deep_gemm/utils.py
+ def calc_diff(x, y):
+     x, y = x.double(), y.double()
+     denominator = (x * x + y * y).sum()
+     sim = 2 * (x * y).sum() / denominator
+     return 1 - sim
 
 
  def get_model_config(tp_size: int):
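`calc_diff` is a global similarity-based error metric (1 minus a cosine-similarity-style ratio computed over the whole tensor), so it is far less sensitive to a handful of outlier elements than the max relative error it replaces below. A small self-contained example of the values it produces (the function is copied here so the snippet runs on its own):

```python
import torch


def calc_diff(x, y):  # copied from the hunk above (DeepGEMM's utils)
    x, y = x.double(), y.double()
    denominator = (x * x + y * y).sum()
    sim = 2 * (x * y).sum() / denominator
    return 1 - sim


a = torch.randn(1024)
b = a + 1e-4 * torch.randn_like(a)
print(float(calc_diff(a, b)))                     # ~5e-9: nearly identical tensors
print(float(calc_diff(a, torch.randn_like(a))))   # ~1.0: unrelated tensors
```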
@@ -69,16 +78,11 @@ def run_test(tp_size, batch_size, model_config, check=False):
 
      # --- Input Data ---
      # Use bf16/fp16 for input activation based on model config
-     x = torch.randn((batch_size, H), device="cuda", dtype=dtype) * 0.0001
+     x = torch.randn((batch_size, H), device="cuda", dtype=dtype)
      # --- Weights (Generate in higher precision, then convert to FP8) ---
      # Generate weights suitable for FP8 conversion (e.g., scaled appropriately)
-     w1_hp = (
-         torch.randn((E, I, H), device="cuda", dtype=torch.float32) * 0.00001 + 0.00001
-     )
-     w2_hp = (
-         torch.randn((E, H, I // 2), device="cuda", dtype=torch.float32) * 0.00001
-         + 0.00001
-     )
+     w1_hp = torch.randn((E, I, H), device="cuda", dtype=torch.float32)
+     w2_hp = torch.randn((E, H, I // 2), device="cuda", dtype=torch.float32)
 
      w1 = to_fp8(w1_hp)
      w2 = to_fp8(w2_hp)
@@ -149,13 +153,13 @@ def run_test(tp_size, batch_size, model_config, check=False):
      )
 
      # Note: Triton expects non-transposed weights
+     moe_config = MoeRunnerConfig(inplace=False)
      triton_lambda = lambda: fused_experts(
          x,
          w1,
          w2,
          (topk_weights, topk_ids, "dummy"),
-         inplace=False,
-         activation="silu",  # Assuming SiLU activation common in MoEs
+         moe_config,
          use_fp8_w8a8=True,
          w1_scale=w1_scale,
          w2_scale=w2_scale,
@@ -221,32 +225,19 @@ def run_test(tp_size, batch_size, model_config, check=False):
              w1,  # Original shape
              w2,  # Original shape
              (topk_weights, topk_ids, "dummy"),
-             inplace=False,  # Important: Use False to get output tensor
-             activation="silu",
+             moe_config,
              use_fp8_w8a8=True,
              w1_scale=w1_scale,
              w2_scale=w2_scale,
              block_shape=block_shape,
          )
 
-         # Ensure outputs are same dtype for comparison
-         y_cutlass = y_cutlass.to(dtype)
-         y_triton = y_triton.to(dtype)
-
-         abs_error = torch.abs(y_cutlass - y_triton)
-         rel_error = abs_error / torch.clamp(torch.abs(y_triton), min=1e-2)
-
-         max_abs_err = abs_error.max().item()
-         max_rel_err = rel_error.max().item()
-
-         print("y_cutlass:", y_cutlass[:, :10])
-         print("y_triton:", y_triton[:, :10])
-         print(f"Max absolute error: {max_abs_err:.6f}")
-         print(f"Max relative error: {max_rel_err:.6f}")
+         diff = calc_diff(y_cutlass, y_triton)
+         print(f"Diff: {diff:.6f}")
 
          # Tolerance might need adjustment based on FP8 specifics and kernel differences
          # FP8 comparisons often require higher tolerance than FP16/BF16
-         assert max_rel_err < 5e-1, f"Relative error too high! {max_rel_err}"
+         assert diff < 1e-4, f"Diff too high! {diff}"
          print("Correctness check passed.")
 
 
@@ -264,7 +255,21 @@ if __name__ == "__main__":
          "--batch-sizes",
          type=int,
          nargs="+",
-         default=[1, 4, 8, 16, 32, 64, 128, 256, 512, 1024],  # Adjusted default
+         default=[
+             1,
+             4,
+             8,
+             16,
+             32,
+             64,
+             128,
+             256,
+             512,
+             1024,
+             2048,
+             4096,
+             8192,
+         ],  # Adjusted default
          help="List of batch sizes to test",
      )
      parser.add_argument("--check", action="store_true", help="Enable check mode")
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.5.1"
+ __version__ = "0.5.1.post2"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sglang
- Version: 0.5.1
+ Version: 0.5.1.post2
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
                            Version 2.0, January 2004
@@ -262,7 +262,7 @@ Requires-Dist: torch==2.8.0; extra == "srt"
  Requires-Dist: torchaudio==2.8.0; extra == "srt"
  Requires-Dist: torchvision; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
- Requires-Dist: flashinfer_python==0.2.11.post3; extra == "srt"
+ Requires-Dist: flashinfer_python==0.2.14.post1; extra == "srt"
  Provides-Extra: blackwell
  Requires-Dist: sglang[runtime_common]; extra == "blackwell"
  Requires-Dist: sgl-kernel; extra == "blackwell"
@@ -270,7 +270,7 @@ Requires-Dist: torch==2.8.0; extra == "blackwell"
  Requires-Dist: torchaudio==2.8.0; extra == "blackwell"
  Requires-Dist: torchvision; extra == "blackwell"
  Requires-Dist: cuda-python; extra == "blackwell"
- Requires-Dist: flashinfer_python==0.2.11.post3; extra == "blackwell"
+ Requires-Dist: flashinfer_python==0.2.14.post1; extra == "blackwell"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
@@ -374,7 +374,7 @@ Dynamic: license-file
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
  ## News
- - [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking. [Register here](https://lu.ma/gbfhjvuo).
+ - [2025/08] 🔔 SGLang x AMD SF Meetup on 8/22: Hands-on GPU workshop, tech talks by AMD/xAI/SGLang, and networking ([Roadmap](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_roadmap.pdf), [Large-scale EP](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_meetup_sglang_ep.pdf)).
  - [2025/08] 🔥 SGLang provides day-0 support for OpenAI gpt-oss model ([instructions](https://github.com/sgl-project/sglang/issues/8833))
  - [2025/06] 🔥 SGLang, the high-performance serving infrastructure powering trillions of tokens daily, has been awarded the third batch of the Open Source AI Grant by a16z ([a16z blog](https://a16z.com/advancing-open-source-ai-through-benchmarks-and-bold-experimentation/)).
  - [2025/06] 🔥 Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP (Part I): 2.7x Higher Decoding Throughput ([blog](https://lmsys.org/blog/2025-06-16-gb200-part-1/)).