sglang 0.5.2rc0__py3-none-any.whl → 0.5.2rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. sglang/lang/interpreter.py +1 -1
  2. sglang/srt/configs/internvl.py +6 -0
  3. sglang/srt/configs/model_config.py +2 -1
  4. sglang/srt/disaggregation/mini_lb.py +2 -2
  5. sglang/srt/distributed/parallel_state.py +46 -41
  6. sglang/srt/entrypoints/engine.py +1 -1
  7. sglang/srt/entrypoints/http_server.py +5 -1
  8. sglang/srt/entrypoints/openai/protocol.py +3 -3
  9. sglang/srt/entrypoints/openai/serving_chat.py +3 -3
  10. sglang/srt/entrypoints/openai/serving_completions.py +3 -1
  11. sglang/srt/entrypoints/openai/serving_embedding.py +1 -1
  12. sglang/srt/entrypoints/openai/serving_responses.py +1 -1
  13. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  14. sglang/srt/layers/attention/aiter_backend.py +93 -68
  15. sglang/srt/layers/communicator.py +45 -7
  16. sglang/srt/layers/moe/cutlass_w4a8_moe.py +1 -9
  17. sglang/srt/layers/moe/ep_moe/layer.py +2 -7
  18. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  19. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -1048
  21. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  22. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +796 -0
  23. sglang/srt/layers/moe/fused_moe_triton/layer.py +5 -2
  24. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  25. sglang/srt/layers/moe/utils.py +0 -1
  26. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +8 -0
  27. sglang/srt/layers/quantization/modelopt_quant.py +35 -2
  28. sglang/srt/layers/quantization/mxfp4.py +4 -1
  29. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  30. sglang/srt/layers/quantization/quark/utils.py +97 -0
  31. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  32. sglang/srt/layers/quantization/w4afp8.py +30 -25
  33. sglang/srt/layers/rocm_linear_utils.py +44 -0
  34. sglang/srt/layers/rotary_embedding.py +0 -18
  35. sglang/srt/managers/cache_controller.py +42 -39
  36. sglang/srt/managers/detokenizer_manager.py +0 -34
  37. sglang/srt/managers/multi_tokenizer_mixin.py +48 -6
  38. sglang/srt/managers/schedule_policy.py +3 -2
  39. sglang/srt/managers/scheduler.py +7 -100
  40. sglang/srt/managers/scheduler_metrics_mixin.py +113 -7
  41. sglang/srt/managers/template_manager.py +3 -3
  42. sglang/srt/managers/tokenizer_manager.py +1 -0
  43. sglang/srt/mem_cache/allocator.py +1 -1
  44. sglang/srt/mem_cache/hicache_storage.py +15 -10
  45. sglang/srt/mem_cache/hiradix_cache.py +16 -0
  46. sglang/srt/mem_cache/memory_pool_host.py +18 -11
  47. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  48. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +35 -6
  49. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +32 -13
  50. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  51. sglang/srt/metrics/collector.py +12 -4
  52. sglang/srt/metrics/utils.py +48 -0
  53. sglang/srt/model_executor/forward_batch_info.py +16 -17
  54. sglang/srt/model_executor/model_runner.py +1 -1
  55. sglang/srt/models/deepseek_v2.py +245 -36
  56. sglang/srt/models/glm4_moe.py +10 -1
  57. sglang/srt/models/gpt_oss.py +5 -4
  58. sglang/srt/models/internvl.py +28 -0
  59. sglang/srt/models/longcat_flash.py +26 -15
  60. sglang/srt/models/longcat_flash_nextn.py +23 -15
  61. sglang/srt/models/minicpmv.py +165 -3
  62. sglang/srt/models/qwen2_moe.py +4 -1
  63. sglang/srt/models/qwen3.py +8 -2
  64. sglang/srt/models/qwen3_moe.py +39 -8
  65. sglang/srt/models/torch_native_llama.py +1 -1
  66. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  67. sglang/srt/server_args.py +79 -2
  68. sglang/srt/speculative/eagle_worker.py +158 -112
  69. sglang/srt/utils.py +12 -10
  70. sglang/test/few_shot_gsm8k.py +1 -0
  71. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  72. sglang/utils.py +1 -0
  73. sglang/version.py +1 -1
  74. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/METADATA +2 -2
  75. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/RECORD +83 -76
  76. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  77. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  78. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  79. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  80. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  81. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  82. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/WHEEL +0 -0
  83. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/licenses/LICENSE +0 -0
  84. {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/top_level.txt +0 -0
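Before the per-file hunks below, note the module moves in items 66 and 77-81: the parser helpers (code_completion_parser, conversation, harmony_parser, jinja_template_utils, reasoning_parser) now live under sglang/srt/parser/, and model_parallel.py moves under sglang/srt/layers/. A hedged sketch of the import-path adjustment this implies for downstream code follows; whether 0.5.2rc2 keeps aliases at the old locations is not shown in this diff.

    # Illustrative only: module paths implied by the renames in items 66 and 77-81.
    import importlib

    # was "sglang.srt.reasoning_parser" in 0.5.2rc0
    reasoning_parser = importlib.import_module("sglang.srt.parser.reasoning_parser")
    # was "sglang.srt.conversation" in 0.5.2rc0
    conversation = importlib.import_module("sglang.srt.parser.conversation")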
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -187,137 +187,183 @@ class EAGLEWorker(TpModelWorker):
         self.has_prefill_wrapper_verify = False
         self.draft_extend_attn_backend = None
 
-        if self.server_args.attention_backend == "flashinfer":
-            if not global_server_args_dict["use_mla_backend"]:
-                from sglang.srt.layers.attention.flashinfer_backend import (
-                    FlashInferAttnBackend,
-                    FlashInferMultiStepDraftBackend,
-                )
+        # Initialize decode attention backend
+        self.draft_attn_backend = self._create_decode_backend()
 
-                self.draft_attn_backend = FlashInferMultiStepDraftBackend(
-                    self.draft_model_runner,
-                    self.topk,
-                    self.speculative_num_steps,
-                )
-                self.draft_extend_attn_backend = FlashInferAttnBackend(
-                    self.draft_model_runner,
-                    skip_prefill=False,
-                )
-            else:
-                from sglang.srt.layers.attention.flashinfer_mla_backend import (
-                    FlashInferMLAAttnBackend,
-                    FlashInferMLAMultiStepDraftBackend,
-                )
+        # Initialize prefill attention backend
+        self.draft_extend_attn_backend = self._create_draft_extend_backend()
 
-                self.draft_attn_backend = FlashInferMLAMultiStepDraftBackend(
-                    self.draft_model_runner,
-                    self.topk,
-                    self.speculative_num_steps,
-                )
-                self.draft_extend_attn_backend = FlashInferMLAAttnBackend(
-                    self.draft_model_runner,
-                    skip_prefill=False,
-                )
-            self.has_prefill_wrapper_verify = True
-        elif self.server_args.attention_backend == "triton":
-            from sglang.srt.layers.attention.triton_backend import (
-                TritonAttnBackend,
-                TritonMultiStepDraftBackend,
-            )
+        self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
 
-            self.draft_attn_backend = TritonMultiStepDraftBackend(
-                self.draft_model_runner,
-                self.topk,
-                self.speculative_num_steps,
-            )
-            self.draft_extend_attn_backend = TritonAttnBackend(
-                self.draft_model_runner,
-                skip_prefill=False,
-            )
-        elif self.server_args.attention_backend == "aiter":
-            from sglang.srt.layers.attention.aiter_backend import (
-                AiterAttnBackend,
-                AiterMultiStepDraftBackend,
-            )
+    def _create_backend(
+        self, backend_name: str, backend_map: dict, error_template: str
+    ):
+        backend_type = getattr(self.server_args, backend_name)
+        if backend_type is None:
+            backend_type = self.server_args.attention_backend
+
+        if backend_type not in backend_map:
+            raise ValueError(error_template.format(backend_type=backend_type))
+
+        return backend_map[backend_type]()
+
+    def _create_decode_backend(self):
+        backend_map = {
+            "flashinfer": self._create_flashinfer_decode_backend,
+            "triton": self._create_triton_decode_backend,
+            "aiter": self._create_aiter_decode_backend,
+            "fa3": self._create_fa3_decode_backend,
+            "flashmla": self._create_flashmla_decode_backend,
+            "trtllm_mha": self._create_trtllm_mha_decode_backend,
+            "trtllm_mla": self._create_trtllm_mla_decode_backend,
+        }
+
+        return self._create_backend(
+            "decode_attention_backend",
+            backend_map,
+            "EAGLE is not supported in decode attention backend {backend_type}",
+        )
 
-            self.draft_attn_backend = AiterMultiStepDraftBackend(
-                self.draft_model_runner,
-                self.topk,
-                self.speculative_num_steps,
-            )
-            self.draft_extend_attn_backend = AiterAttnBackend(
-                self.draft_model_runner,
-                skip_prefill=False,
-            )
-            self.has_prefill_wrapper_verify = False
-        elif self.server_args.attention_backend == "fa3":
-            from sglang.srt.layers.attention.flashattention_backend import (
-                FlashAttentionBackend,
-                FlashAttentionMultiStepBackend,
-            )
+    def _create_draft_extend_backend(self):
+        backend_map = {
+            "flashinfer": self._create_flashinfer_prefill_backend,
+            "triton": self._create_triton_prefill_backend,
+            "aiter": self._create_aiter_prefill_backend,
+            "fa3": self._create_fa3_prefill_backend,
+            "trtllm_mha": self._create_trtllm_mha_prefill_backend,
+            "trtllm_mla": self._create_trtllm_mla_prefill_backend,
+        }
+
+        return self._create_backend(
+            "prefill_attention_backend",
+            backend_map,
+            "EAGLE is not supported in prefill attention backend {backend_type}",
+        )
 
-            self.draft_attn_backend = FlashAttentionMultiStepBackend(
-                self.draft_model_runner,
-                self.topk,
-                self.speculative_num_steps,
-            )
-            self.draft_extend_attn_backend = FlashAttentionBackend(
-                self.draft_model_runner,
-                skip_prefill=False,
-            )
-        elif self.server_args.attention_backend == "flashmla":
-            from sglang.srt.layers.attention.flashmla_backend import (
-                FlashMLAMultiStepDraftBackend,
+    def _create_flashinfer_decode_backend(self):
+        if not global_server_args_dict["use_mla_backend"]:
+            from sglang.srt.layers.attention.flashinfer_backend import (
+                FlashInferMultiStepDraftBackend,
             )
 
-            self.draft_attn_backend = FlashMLAMultiStepDraftBackend(
-                self.draft_model_runner,
-                self.topk,
-                self.speculative_num_steps,
+            self.has_prefill_wrapper_verify = True
+            return FlashInferMultiStepDraftBackend(
+                self.draft_model_runner, self.topk, self.speculative_num_steps
             )
-        elif self.server_args.attention_backend == "trtllm_mha":
-            from sglang.srt.layers.attention.trtllm_mha_backend import (
-                TRTLLMHAAttnBackend,
-                TRTLLMHAAttnMultiStepDraftBackend,
+        else:
+            from sglang.srt.layers.attention.flashinfer_mla_backend import (
+                FlashInferMLAMultiStepDraftBackend,
             )
 
-            self.draft_attn_backend = TRTLLMHAAttnMultiStepDraftBackend(
-                self.draft_model_runner,
-                self.topk,
-                self.speculative_num_steps,
-            )
-            self.draft_extend_attn_backend = TRTLLMHAAttnBackend(
-                self.draft_model_runner,
-                skip_prefill=False,
-            )
             self.has_prefill_wrapper_verify = True
-        elif self.server_args.attention_backend == "trtllm_mla":
-            if not global_server_args_dict["use_mla_backend"]:
-                raise ValueError(
-                    "trtllm_mla backend requires MLA model (use_mla_backend=True)."
-                )
-
-            from sglang.srt.layers.attention.trtllm_mla_backend import (
-                TRTLLMMLABackend,
-                TRTLLMMLAMultiStepDraftBackend,
+            return FlashInferMLAMultiStepDraftBackend(
+                self.draft_model_runner, self.topk, self.speculative_num_steps
             )
 
-            self.draft_attn_backend = TRTLLMMLAMultiStepDraftBackend(
-                self.draft_model_runner,
-                self.topk,
-                self.speculative_num_steps,
+    def _create_triton_decode_backend(self):
+        from sglang.srt.layers.attention.triton_backend import (
+            TritonMultiStepDraftBackend,
+        )
+
+        return TritonMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_aiter_decode_backend(self):
+        from sglang.srt.layers.attention.aiter_backend import AiterMultiStepDraftBackend
+
+        return AiterMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_fa3_decode_backend(self):
+        from sglang.srt.layers.attention.flashattention_backend import (
+            FlashAttentionMultiStepBackend,
+        )
+
+        return FlashAttentionMultiStepBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_flashmla_decode_backend(self):
+        from sglang.srt.layers.attention.flashmla_backend import (
+            FlashMLAMultiStepDraftBackend,
+        )
+
+        return FlashMLAMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_trtllm_mha_decode_backend(self):
+        from sglang.srt.layers.attention.trtllm_mha_backend import (
+            TRTLLMHAAttnMultiStepDraftBackend,
+        )
+
+        self.has_prefill_wrapper_verify = True
+        return TRTLLMHAAttnMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_trtllm_mla_decode_backend(self):
+        if not global_server_args_dict["use_mla_backend"]:
+            raise ValueError(
+                "trtllm_mla backend requires MLA model (use_mla_backend=True)."
             )
-            self.draft_extend_attn_backend = TRTLLMMLABackend(
-                self.draft_model_runner,
-                skip_prefill=False,
+
+        from sglang.srt.layers.attention.trtllm_mla_backend import (
+            TRTLLMMLAMultiStepDraftBackend,
+        )
+
+        self.has_prefill_wrapper_verify = True
+        return TRTLLMMLAMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_flashinfer_prefill_backend(self):
+        if not global_server_args_dict["use_mla_backend"]:
+            from sglang.srt.layers.attention.flashinfer_backend import (
+                FlashInferAttnBackend,
             )
-            self.has_prefill_wrapper_verify = True
+
+            return FlashInferAttnBackend(self.draft_model_runner, skip_prefill=False)
         else:
+            from sglang.srt.layers.attention.flashinfer_mla_backend import (
+                FlashInferMLAAttnBackend,
+            )
+
+            return FlashInferMLAAttnBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_triton_prefill_backend(self):
+        from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
+
+        return TritonAttnBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_aiter_prefill_backend(self):
+        from sglang.srt.layers.attention.aiter_backend import AiterAttnBackend
+
+        return AiterAttnBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_fa3_prefill_backend(self):
+        from sglang.srt.layers.attention.flashattention_backend import (
+            FlashAttentionBackend,
+        )
+
+        return FlashAttentionBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_trtllm_mha_prefill_backend(self):
+        from sglang.srt.layers.attention.trtllm_mha_backend import TRTLLMHAAttnBackend
+
+        return TRTLLMHAAttnBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_trtllm_mla_prefill_backend(self):
+        if not global_server_args_dict["use_mla_backend"]:
             raise ValueError(
-                f"EAGLE is not supported in attention backend {self.server_args.attention_backend}"
+                "trtllm_mla backend requires MLA model (use_mla_backend=True)."
             )
 
-        self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
+        from sglang.srt.layers.attention.trtllm_mla_backend import TRTLLMMLABackend
+
+        return TRTLLMMLABackend(self.draft_model_runner, skip_prefill=False)
 
     def init_cuda_graphs(self):
         """Capture cuda graphs."""
sglang/srt/utils.py CHANGED
@@ -2787,16 +2787,6 @@ def lru_cache_frozenset(maxsize=128):
    return decorator
 
 
-def get_worker_ids_from_req_rids(rids):
-    if isinstance(rids, list):
-        worker_ids = [int(rid.split("_")[0]) for rid in rids]
-    elif isinstance(rids, str):
-        worker_ids = [int(rids.split("_")[0])]
-    else:
-        worker_ids = []
-    return worker_ids
-
-
 def get_origin_rid(rid):
     return rid.split("_", 1)[1] if "_" in rid else rid
 
@@ -2910,6 +2900,18 @@ def mxfp_supported():
        return False
 
 
+@lru_cache(maxsize=1)
+def is_gfx95_supported():
+    """
+    Returns whether the current platform supports MX types.
+    """
+    if torch.version.hip:
+        gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
+        return any(gfx in gcn_arch for gfx in ["gfx95"])
+    else:
+        return False
+
+
 # LoRA-related constants and utilities
 SUPPORTED_LORA_TARGET_MODULES = [
     "q_proj",
sglang/test/few_shot_gsm8k.py CHANGED
@@ -129,6 +129,7 @@ def run_eval(args):
 
     return {
         "accuracy": acc,
+        "invalid": invalid,
         "latency": latency,
         "output_throughput": output_throughput,
     }
sglang/test/test_cutlass_w4a8_moe.py CHANGED
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Optional
+from typing import Literal, Optional
 
 import pytest
 import torch
@@ -25,7 +25,7 @@ def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Tensor:
     return packed_tensor.to(torch.int8)
 
 
-def pack_interleave(num_experts, ref_weight, ref_scale):
+def pack_interleave(num_experts, ref_weight, ref_scale, alignment=4):
     n, k = ref_weight.shape[1], ref_weight.shape[2]
 
     weight = pack_int4_values_to_int8(ref_weight.cpu()).cuda()
@@ -33,11 +33,16 @@ def pack_interleave(num_experts, ref_weight, ref_scale):
     w_q = w_q.contiguous()
 
     scale_interleaved = ref_scale.reshape(
-        ref_scale.shape[0], ref_scale.shape[1], (ref_scale.shape[2] // 4), 4
+        ref_scale.shape[0],
+        ref_scale.shape[1],
+        (ref_scale.shape[2] // alignment),
+        alignment,
     )  # [E, N, K/4, 4]
     scale_interleaved = scale_interleaved.permute(0, 2, 1, 3)  # [E, K/4, N, 4]
     scale_interleaved = scale_interleaved.reshape(
-        ref_scale.shape[0], ref_scale.shape[2] // 4, ref_scale.shape[1] * 4
+        ref_scale.shape[0],
+        ref_scale.shape[2] // alignment,
+        ref_scale.shape[1] * alignment,
     )  # [E, K/4, N*4]
     w_scale = scale_interleaved.contiguous()
 
@@ -48,12 +53,17 @@ def pack_interleave(num_experts, ref_weight, ref_scale):
 @pytest.mark.parametrize("N", [2048])
 @pytest.mark.parametrize("K", [7168])
 @pytest.mark.parametrize("E", [256])
-@pytest.mark.parametrize("ep_size", [8])
+@pytest.mark.parametrize("tp_size", [8])
+@pytest.mark.parametrize("use_ep_moe", [True, False])
 @pytest.mark.parametrize("topk", [8])
 @pytest.mark.parametrize("group_size", [128])
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
-def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
-    local_e = E // ep_size
+def test_cutlass_w4a8_moe(M, N, K, E, tp_size, use_ep_moe, topk, group_size, dtype):
+    if use_ep_moe:
+        local_e = E // tp_size
+    else:  # tp mode
+        local_e = E
+        N = N // tp_size
 
     debug = False
     if debug:
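The parametrization change above runs the kernel test in two sharding modes: with use_ep_moe=True the experts are split across ranks (local_e = E // tp_size at full width N), while with use_ep_moe=False every rank keeps all E experts but the intermediate width is split (N // tp_size). A quick illustration with the test's default sizes; the variable names below are only for exposition.

    # Per-rank shapes implied by the parametrization above (E=256, N=2048, tp_size=8).
    E, N, tp_size = 256, 2048, 8

    ep_local_e, ep_local_n = E // tp_size, N      # EP mode: 32 experts per rank, full width
    tp_local_e, tp_local_n = E, N // tp_size      # TP mode: all 256 experts, width 256

    print(ep_local_e, ep_local_n)  # 32 2048
    print(tp_local_e, tp_local_n)  # 256 256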
@@ -87,7 +97,10 @@ def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
     )
 
     w1_q, w1_scale = pack_interleave(local_e, ref_weight_1, scale_1)
-    w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2)
+    if use_ep_moe:
+        w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2)
+    else:
+        w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2, 1)
 
     device = "cuda"
     a_strides1 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
@@ -265,7 +278,9 @@ def ref(
 
     gate, fc1 = fc1.chunk(2, dim=-1)
     fc1 = fc1 * torch.nn.functional.silu(gate)
-    act = (fc1 / pre_quant_scale_2.float()).to(torch.float8_e4m3fn)
+    act = torch.clamp((fc1 / pre_quant_scale_2.float()), -448.0, 448.0).to(
+        torch.float8_e4m3fn
+    )
     act = act.to(dtype)
 
     w2 = ref_weight_2[e_idx]
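The final hunk clamps the scaled activations to [-448, 448] before casting to torch.float8_e4m3fn; 448 is the largest finite value that format can represent, so the clamp keeps the reference path from pushing out-of-range values into the fp8 cast. A small check of the bound (illustrative, not part of the test):

    # 448 is the e4m3fn max; clamping first keeps the cast within range.
    import torch

    print(torch.finfo(torch.float8_e4m3fn).max)  # 448.0

    x = torch.tensor([1000.0, -600.0, 3.2])
    x_fp8 = torch.clamp(x, -448.0, 448.0).to(torch.float8_e4m3fn)
    print(x_fp8.to(torch.float32))  # tensor([ 448.00, -448.00, 3.25])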
sglang/utils.py CHANGED
@@ -457,6 +457,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
                 NOTE: Typically, the server runs in a separate terminal.
                 In this notebook, we run the server and notebook code together, so their outputs are combined.
                 To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
+                To reduce the log length, we set the log level to warning for the server, the default log level is info.
                 We are running those notebooks in a CI environment, so the throughput is not representative of the actual performance.
                 """
             )
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.5.2rc0"
+__version__ = "0.5.2rc2"
{sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.2rc0
+Version: 0.5.2rc2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -257,7 +257,7 @@ Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.7.post1; extra == "srt"
+Requires-Dist: sgl-kernel==0.3.8; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"