sglang 0.5.2rc0__py3-none-any.whl → 0.5.2rc2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registry.
- sglang/lang/interpreter.py +1 -1
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/model_config.py +2 -1
- sglang/srt/disaggregation/mini_lb.py +2 -2
- sglang/srt/distributed/parallel_state.py +46 -41
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/entrypoints/http_server.py +5 -1
- sglang/srt/entrypoints/openai/protocol.py +3 -3
- sglang/srt/entrypoints/openai/serving_chat.py +3 -3
- sglang/srt/entrypoints/openai/serving_completions.py +3 -1
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -1
- sglang/srt/entrypoints/openai/serving_responses.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/communicator.py +45 -7
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +1 -9
- sglang/srt/layers/moe/ep_moe/layer.py +2 -7
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -1048
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +796 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/moe/utils.py +0 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +8 -0
- sglang/srt/layers/quantization/modelopt_quant.py +35 -2
- sglang/srt/layers/quantization/mxfp4.py +4 -1
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/w4afp8.py +30 -25
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +0 -18
- sglang/srt/managers/cache_controller.py +42 -39
- sglang/srt/managers/detokenizer_manager.py +0 -34
- sglang/srt/managers/multi_tokenizer_mixin.py +48 -6
- sglang/srt/managers/schedule_policy.py +3 -2
- sglang/srt/managers/scheduler.py +7 -100
- sglang/srt/managers/scheduler_metrics_mixin.py +113 -7
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_manager.py +1 -0
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +15 -10
- sglang/srt/mem_cache/hiradix_cache.py +16 -0
- sglang/srt/mem_cache/memory_pool_host.py +18 -11
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +35 -6
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +32 -13
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/metrics/collector.py +12 -4
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/forward_batch_info.py +16 -17
- sglang/srt/model_executor/model_runner.py +1 -1
- sglang/srt/models/deepseek_v2.py +245 -36
- sglang/srt/models/glm4_moe.py +10 -1
- sglang/srt/models/gpt_oss.py +5 -4
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/longcat_flash.py +26 -15
- sglang/srt/models/longcat_flash_nextn.py +23 -15
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/qwen2_moe.py +4 -1
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +39 -8
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/server_args.py +79 -2
- sglang/srt/speculative/eagle_worker.py +158 -112
- sglang/srt/utils.py +12 -10
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/METADATA +2 -2
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/RECORD +83 -76
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/top_level.txt +0 -0
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -187,137 +187,183 @@ class EAGLEWorker(TpModelWorker):
         self.has_prefill_wrapper_verify = False
         self.draft_extend_attn_backend = None
 
-        if self.server_args.attention_backend == "flashinfer":
-            if not global_server_args_dict["use_mla_backend"]:
-                from sglang.srt.layers.attention.flashinfer_backend import (
-                    FlashInferAttnBackend,
-                    FlashInferMultiStepDraftBackend,
-                )
-
-                self.draft_attn_backend = FlashInferMultiStepDraftBackend(
-                    self.draft_model_runner,
-                    self.topk,
-                    self.speculative_num_steps,
-                )
-                self.draft_extend_attn_backend = FlashInferAttnBackend(
-                    self.draft_model_runner,
-                    skip_prefill=False,
-                )
-            else:
-                from sglang.srt.layers.attention.flashinfer_mla_backend import (
-                    FlashInferMLAAttnBackend,
-                    FlashInferMLAMultiStepDraftBackend,
-                )
-
-                self.draft_attn_backend = FlashInferMLAMultiStepDraftBackend(
-                    self.draft_model_runner,
-                    self.topk,
-                    self.speculative_num_steps,
-                )
-                self.draft_extend_attn_backend = FlashInferMLAAttnBackend(
-                    self.draft_model_runner,
-                    skip_prefill=False,
-                )
-            self.has_prefill_wrapper_verify = True
-        elif self.server_args.attention_backend == "triton":
-            from sglang.srt.layers.attention.triton_backend import (
-                TritonAttnBackend,
-                TritonMultiStepDraftBackend,
-            )
-
-            self.draft_attn_backend = TritonMultiStepDraftBackend(
-                self.draft_model_runner,
-                self.topk,
-                self.speculative_num_steps,
-            )
-            self.draft_extend_attn_backend = TritonAttnBackend(
-                self.draft_model_runner,
-                skip_prefill=False,
-            )
-        elif self.server_args.attention_backend == "aiter":
-            from sglang.srt.layers.attention.aiter_backend import (
-                AiterAttnBackend,
-                AiterMultiStepDraftBackend,
-            )
-
-            self.draft_attn_backend = AiterMultiStepDraftBackend(
-                self.draft_model_runner,
-                self.topk,
-                self.speculative_num_steps,
-            )
-            self.draft_extend_attn_backend = AiterAttnBackend(
-                self.draft_model_runner,
-                skip_prefill=False,
-            )
-        elif self.server_args.attention_backend == "fa3":
-            from sglang.srt.layers.attention.flashattention_backend import (
-                FlashAttentionBackend,
-                FlashAttentionMultiStepBackend,
-            )
-
-
-            self.draft_attn_backend = FlashAttentionMultiStepBackend(
-                self.draft_model_runner,
-                self.topk,
-                self.speculative_num_steps,
-            )
-            self.draft_extend_attn_backend = FlashAttentionBackend(
-                self.draft_model_runner,
-                skip_prefill=False,
-            )
-        elif self.server_args.attention_backend == "flashmla":
-            from sglang.srt.layers.attention.flashmla_backend import (
-                FlashMLAMultiStepDraftBackend,
-            )
-
-            self.draft_attn_backend = FlashMLAMultiStepDraftBackend(
-                self.draft_model_runner,
-                self.topk,
-                self.speculative_num_steps,
-            )
-        elif self.server_args.attention_backend == "trtllm_mha":
-            from sglang.srt.layers.attention.trtllm_mha_backend import (
-                TRTLLMHAAttnBackend,
-                TRTLLMHAAttnMultiStepDraftBackend,
-            )
-
-            self.draft_attn_backend = TRTLLMHAAttnMultiStepDraftBackend(
-                self.draft_model_runner,
-                self.topk,
-                self.speculative_num_steps,
-            )
-            self.draft_extend_attn_backend = TRTLLMHAAttnBackend(
-                self.draft_model_runner,
-                skip_prefill=False,
-            )
-            self.has_prefill_wrapper_verify = True
-        elif self.server_args.attention_backend == "trtllm_mla":
-            if not global_server_args_dict["use_mla_backend"]:
-                raise ValueError(
-                    "trtllm_mla backend requires MLA model (use_mla_backend=True)."
-                )
-
-            from sglang.srt.layers.attention.trtllm_mla_backend import (
-                TRTLLMMLABackend,
-                TRTLLMMLAMultiStepDraftBackend,
-            )
-
-            self.draft_attn_backend = TRTLLMMLAMultiStepDraftBackend(
-                self.draft_model_runner,
-                self.topk,
-                self.speculative_num_steps,
-            )
-            self.draft_extend_attn_backend = TRTLLMMLABackend(
-                self.draft_model_runner,
-                skip_prefill=False,
-            )
-            self.has_prefill_wrapper_verify = True
-        else:
-            raise ValueError(
-                f"EAGLE is not supported in attention backend {self.server_args.attention_backend}"
-            )
-
-        self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
+        # Initialize decode attention backend
+        self.draft_attn_backend = self._create_decode_backend()
+
+        # Initialize prefill attention backend
+        self.draft_extend_attn_backend = self._create_draft_extend_backend()
+
+        self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
+
+    def _create_backend(
+        self, backend_name: str, backend_map: dict, error_template: str
+    ):
+        backend_type = getattr(self.server_args, backend_name)
+        if backend_type is None:
+            backend_type = self.server_args.attention_backend
+
+        if backend_type not in backend_map:
+            raise ValueError(error_template.format(backend_type=backend_type))
+
+        return backend_map[backend_type]()
+
+    def _create_decode_backend(self):
+        backend_map = {
+            "flashinfer": self._create_flashinfer_decode_backend,
+            "triton": self._create_triton_decode_backend,
+            "aiter": self._create_aiter_decode_backend,
+            "fa3": self._create_fa3_decode_backend,
+            "flashmla": self._create_flashmla_decode_backend,
+            "trtllm_mha": self._create_trtllm_mha_decode_backend,
+            "trtllm_mla": self._create_trtllm_mla_decode_backend,
+        }
+
+        return self._create_backend(
+            "decode_attention_backend",
+            backend_map,
+            "EAGLE is not supported in decode attention backend {backend_type}",
+        )
+
+    def _create_draft_extend_backend(self):
+        backend_map = {
+            "flashinfer": self._create_flashinfer_prefill_backend,
+            "triton": self._create_triton_prefill_backend,
+            "aiter": self._create_aiter_prefill_backend,
+            "fa3": self._create_fa3_prefill_backend,
+            "trtllm_mha": self._create_trtllm_mha_prefill_backend,
+            "trtllm_mla": self._create_trtllm_mla_prefill_backend,
+        }
+
+        return self._create_backend(
+            "prefill_attention_backend",
+            backend_map,
+            "EAGLE is not supported in prefill attention backend {backend_type}",
+        )
+
+    def _create_flashinfer_decode_backend(self):
+        if not global_server_args_dict["use_mla_backend"]:
+            from sglang.srt.layers.attention.flashinfer_backend import (
+                FlashInferMultiStepDraftBackend,
+            )
+
+            self.has_prefill_wrapper_verify = True
+            return FlashInferMultiStepDraftBackend(
+                self.draft_model_runner, self.topk, self.speculative_num_steps
+            )
+        else:
+            from sglang.srt.layers.attention.flashinfer_mla_backend import (
+                FlashInferMLAMultiStepDraftBackend,
+            )
+
+            self.has_prefill_wrapper_verify = True
+            return FlashInferMLAMultiStepDraftBackend(
+                self.draft_model_runner, self.topk, self.speculative_num_steps
+            )
+
+    def _create_triton_decode_backend(self):
+        from sglang.srt.layers.attention.triton_backend import (
+            TritonMultiStepDraftBackend,
+        )
+
+        return TritonMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_aiter_decode_backend(self):
+        from sglang.srt.layers.attention.aiter_backend import AiterMultiStepDraftBackend
+
+        return AiterMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_fa3_decode_backend(self):
+        from sglang.srt.layers.attention.flashattention_backend import (
+            FlashAttentionMultiStepBackend,
+        )
+
+        return FlashAttentionMultiStepBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_flashmla_decode_backend(self):
+        from sglang.srt.layers.attention.flashmla_backend import (
+            FlashMLAMultiStepDraftBackend,
+        )
+
+        return FlashMLAMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_trtllm_mha_decode_backend(self):
+        from sglang.srt.layers.attention.trtllm_mha_backend import (
+            TRTLLMHAAttnMultiStepDraftBackend,
+        )
+
+        self.has_prefill_wrapper_verify = True
+        return TRTLLMHAAttnMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_trtllm_mla_decode_backend(self):
+        if not global_server_args_dict["use_mla_backend"]:
+            raise ValueError(
+                "trtllm_mla backend requires MLA model (use_mla_backend=True)."
+            )
+
+        from sglang.srt.layers.attention.trtllm_mla_backend import (
+            TRTLLMMLAMultiStepDraftBackend,
+        )
+
+        self.has_prefill_wrapper_verify = True
+        return TRTLLMMLAMultiStepDraftBackend(
+            self.draft_model_runner, self.topk, self.speculative_num_steps
+        )
+
+    def _create_flashinfer_prefill_backend(self):
+        if not global_server_args_dict["use_mla_backend"]:
+            from sglang.srt.layers.attention.flashinfer_backend import (
+                FlashInferAttnBackend,
+            )
+
+            return FlashInferAttnBackend(self.draft_model_runner, skip_prefill=False)
+        else:
+            from sglang.srt.layers.attention.flashinfer_mla_backend import (
+                FlashInferMLAAttnBackend,
+            )
+
+            return FlashInferMLAAttnBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_triton_prefill_backend(self):
+        from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
+
+        return TritonAttnBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_aiter_prefill_backend(self):
+        from sglang.srt.layers.attention.aiter_backend import AiterAttnBackend
+
+        return AiterAttnBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_fa3_prefill_backend(self):
+        from sglang.srt.layers.attention.flashattention_backend import (
+            FlashAttentionBackend,
+        )
+
+        return FlashAttentionBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_trtllm_mha_prefill_backend(self):
+        from sglang.srt.layers.attention.trtllm_mha_backend import TRTLLMHAAttnBackend
+
+        return TRTLLMHAAttnBackend(self.draft_model_runner, skip_prefill=False)
+
+    def _create_trtllm_mla_prefill_backend(self):
+        if not global_server_args_dict["use_mla_backend"]:
+            raise ValueError(
+                "trtllm_mla backend requires MLA model (use_mla_backend=True)."
+            )
+
+        from sglang.srt.layers.attention.trtllm_mla_backend import TRTLLMMLABackend
+
+        return TRTLLMMLABackend(self.draft_model_runner, skip_prefill=False)
 
     def init_cuda_graphs(self):
         """Capture cuda graphs."""
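The refactor above replaces a long `if`/`elif` chain with small per-backend factory methods selected through a dispatch table, and lets the new `decode_attention_backend`/`prefill_attention_backend` server args override the global `attention_backend` per phase. A minimal, self-contained sketch of that pattern (the `ServerArgs` and `BackendSelector` classes below are simplified stand-ins, not the sglang API):

```python
from dataclasses import dataclass
from typing import Callable, Dict, Optional


@dataclass
class ServerArgs:
    # Stand-ins for sglang's server args: the per-phase backend may be unset.
    attention_backend: str = "triton"
    decode_attention_backend: Optional[str] = None


class BackendSelector:
    """Illustrative reduction of the EAGLEWorker dispatch-table refactor."""

    def __init__(self, server_args: ServerArgs):
        self.server_args = server_args

    def _create_backend(
        self, backend_name: str, backend_map: Dict[str, Callable], error_template: str
    ):
        # Per-phase override wins; otherwise fall back to the global backend.
        backend_type = getattr(self.server_args, backend_name)
        if backend_type is None:
            backend_type = self.server_args.attention_backend
        if backend_type not in backend_map:
            raise ValueError(error_template.format(backend_type=backend_type))
        # In the real code each factory does its own lazy import, so only
        # the selected backend's dependencies are ever loaded.
        return backend_map[backend_type]()

    def create_decode_backend(self):
        backend_map = {
            "triton": lambda: "TritonMultiStepDraftBackend",
            "flashinfer": lambda: "FlashInferMultiStepDraftBackend",
        }
        return self._create_backend(
            "decode_attention_backend",
            backend_map,
            "EAGLE is not supported in decode attention backend {backend_type}",
        )


print(BackendSelector(ServerArgs()).create_decode_backend())
# TritonMultiStepDraftBackend
print(
    BackendSelector(
        ServerArgs(decode_attention_backend="flashinfer")
    ).create_decode_backend()
)
# FlashInferMultiStepDraftBackend
```

One behavioral consequence visible in the diff: the prefill map has no "flashmla" entry, so a flashmla configuration that previously left `draft_extend_attn_backend` as `None` now reaches the explicit error path instead.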
sglang/srt/utils.py
CHANGED
@@ -2787,16 +2787,6 @@ def lru_cache_frozenset(maxsize=128):
     return decorator
 
 
-def get_worker_ids_from_req_rids(rids):
-    if isinstance(rids, list):
-        worker_ids = [int(rid.split("_")[0]) for rid in rids]
-    elif isinstance(rids, str):
-        worker_ids = [int(rids.split("_")[0])]
-    else:
-        worker_ids = []
-    return worker_ids
-
-
 def get_origin_rid(rid):
     return rid.split("_", 1)[1] if "_" in rid else rid
 
@@ -2910,6 +2900,18 @@ def mxfp_supported():
     return False
 
 
+@lru_cache(maxsize=1)
+def is_gfx95_supported():
+    """
+    Returns whether the current platform supports MX types.
+    """
+    if torch.version.hip:
+        gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
+        return any(gfx in gcn_arch for gfx in ["gfx95"])
+    else:
+        return False
+
+
 # LoRA-related constants and utilities
 SUPPORTED_LORA_TARGET_MODULES = [
     "q_proj",
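The new `is_gfx95_supported()` mirrors the existing `mxfp_supported()` helper: probe the device architecture once and cache the answer with `lru_cache`. On ROCm builds `torch.version.hip` is set and `gcnArchName` (e.g. "gfx950") identifies the GPU family. A hedged sketch of how a caller might gate an arch-specific code path on it; `select_mxfp4_backend` and the backend names are hypothetical, not sglang APIs:

```python
from functools import lru_cache

import torch


@lru_cache(maxsize=1)
def is_gfx95_supported():
    # Same logic as the helper added above: torch.version.hip is only set
    # on ROCm builds, and gcnArchName names the GPU family.
    if torch.version.hip:
        gcn_arch = torch.cuda.get_device_properties(0).gcnArchName
        return any(gfx in gcn_arch for gfx in ["gfx95"])
    else:
        return False


def select_mxfp4_backend() -> str:
    # Hypothetical caller: prefer gfx95-only kernels when available.
    return "rocm_mxfp4" if is_gfx95_supported() else "triton_fallback"
```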
sglang/test/test_cutlass_w4a8_moe.py
CHANGED
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Optional
+from typing import Literal, Optional
 
 import pytest
 import torch
@@ -25,7 +25,7 @@ def pack_int4_values_to_int8(int4_values_interleaved: torch.Tensor) -> torch.Tensor:
     return packed_tensor.to(torch.int8)
 
 
-def pack_interleave(num_experts, ref_weight, ref_scale):
+def pack_interleave(num_experts, ref_weight, ref_scale, alignment=4):
     n, k = ref_weight.shape[1], ref_weight.shape[2]
 
     weight = pack_int4_values_to_int8(ref_weight.cpu()).cuda()
@@ -33,11 +33,16 @@ def pack_interleave(num_experts, ref_weight, ref_scale):
     w_q = w_q.contiguous()
 
     scale_interleaved = ref_scale.reshape(
-        ref_scale.shape[0], ref_scale.shape[1], (ref_scale.shape[2] // 4), 4
+        ref_scale.shape[0],
+        ref_scale.shape[1],
+        (ref_scale.shape[2] // alignment),
+        alignment,
     )  # [E, N, K/4, 4]
     scale_interleaved = scale_interleaved.permute(0, 2, 1, 3)  # [E, K/4, N, 4]
     scale_interleaved = scale_interleaved.reshape(
-        ref_scale.shape[0], ref_scale.shape[2] // 4, ref_scale.shape[1] * 4
+        ref_scale.shape[0],
+        ref_scale.shape[2] // alignment,
+        ref_scale.shape[1] * alignment,
    )  # [E, K/4, N*4]
     w_scale = scale_interleaved.contiguous()
 
@@ -48,12 +53,17 @@ def pack_interleave(num_experts, ref_weight, ref_scale):
 @pytest.mark.parametrize("N", [2048])
 @pytest.mark.parametrize("K", [7168])
 @pytest.mark.parametrize("E", [256])
-@pytest.mark.parametrize("ep_size", [8])
+@pytest.mark.parametrize("tp_size", [8])
+@pytest.mark.parametrize("use_ep_moe", [True, False])
 @pytest.mark.parametrize("topk", [8])
 @pytest.mark.parametrize("group_size", [128])
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
-def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
-    local_e = E // ep_size
+def test_cutlass_w4a8_moe(M, N, K, E, tp_size, use_ep_moe, topk, group_size, dtype):
+    if use_ep_moe:
+        local_e = E // tp_size
+    else:  # tp mode
+        local_e = E
+        N = N // tp_size
 
     debug = False
     if debug:
@@ -87,7 +97,10 @@ def test_cutlass_w4a8_moe(M, N, K, E, ep_size, topk, group_size, dtype):
     )
 
     w1_q, w1_scale = pack_interleave(local_e, ref_weight_1, scale_1)
-    w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2)
+    if use_ep_moe:
+        w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2)
+    else:
+        w2_q, w2_scale = pack_interleave(local_e, ref_weight_2, scale_2, 1)
 
     device = "cuda"
     a_strides1 = torch.full((local_e, 3), K, device=device, dtype=torch.int64)
@@ -265,7 +278,9 @@ def ref(
 
     gate, fc1 = fc1.chunk(2, dim=-1)
     fc1 = fc1 * torch.nn.functional.silu(gate)
-    act = (fc1 / pre_quant_scale_2.float()).to(torch.float8_e4m3fn)
+    act = torch.clamp((fc1 / pre_quant_scale_2.float()), -448.0, 448.0).to(
+        torch.float8_e4m3fn
+    )
     act = act.to(dtype)
 
     w2 = ref_weight_2[e_idx]
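The `pack_interleave` change parameterizes the scale-interleaving alignment: `alignment=4` keeps the old `[E, N, K/4, 4] -> [E, K/4, N*4]` layout, while the TP path now packs the `w2` scales with `alignment=1`. The shape flow can be sanity-checked in isolation; the sketch below replays just the reshape/permute from the diff on a toy tensor (the `interleave_scales` name and the sizes are illustrative):

```python
import torch


def interleave_scales(ref_scale: torch.Tensor, alignment: int = 4) -> torch.Tensor:
    # Same transform as in pack_interleave() above:
    # [E, N, G] -> [E, N, G/a, a] -> [E, G/a, N, a] -> [E, G/a, N*a]
    e, n, g = ref_scale.shape
    s = ref_scale.reshape(e, n, g // alignment, alignment)
    s = s.permute(0, 2, 1, 3)
    return s.reshape(e, g // alignment, n * alignment).contiguous()


E, N, G = 2, 8, 16  # toy sizes; G stands for the K // group_size scale groups
scale = torch.arange(E * N * G, dtype=torch.float32).reshape(E, N, G)
print(interleave_scales(scale, alignment=4).shape)  # torch.Size([2, 4, 32])
print(interleave_scales(scale, alignment=1).shape)  # torch.Size([2, 16, 8]), a plain transpose
```

The other functional change in this file, clamping activations to [-448, 448] before the cast, pins them to the finite range of `torch.float8_e4m3fn` so the reference path saturates instead of producing inf.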
sglang/utils.py
CHANGED
@@ -457,6 +457,7 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
 NOTE: Typically, the server runs in a separate terminal.
 In this notebook, we run the server and notebook code together, so their outputs are combined.
 To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
+To reduce the log length, we set the log level to warning for the server, the default log level is info.
 We are running those notebooks in a CI environment, so the throughput is not representative of the actual performance.
 """
 )
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.5.2rc0"
+__version__ = "0.5.2rc2"
{sglang-0.5.2rc0.dist-info → sglang-0.5.2rc2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.5.2rc0
+Version: 0.5.2rc2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
          Version 2.0, January 2004
@@ -257,7 +257,7 @@ Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.23; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.3.
+Requires-Dist: sgl-kernel==0.3.8; extra == "srt"
 Requires-Dist: torch==2.8.0; extra == "srt"
 Requires-Dist: torchaudio==2.8.0; extra == "srt"
 Requires-Dist: torchvision; extra == "srt"