sglang 0.5.2rc0__py3-none-any.whl → 0.5.2rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/configs/model_config.py +2 -1
- sglang/srt/distributed/parallel_state.py +3 -1
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +1 -9
- sglang/srt/layers/moe/ep_moe/layer.py +2 -7
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -1048
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +796 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +8 -0
- sglang/srt/layers/quantization/w4afp8.py +30 -25
- sglang/srt/managers/detokenizer_manager.py +0 -34
- sglang/srt/managers/multi_tokenizer_mixin.py +44 -6
- sglang/srt/managers/scheduler.py +3 -0
- sglang/srt/mem_cache/hiradix_cache.py +19 -3
- sglang/srt/mem_cache/memory_pool_host.py +2 -0
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +27 -6
- sglang/srt/models/deepseek_v2.py +5 -0
- sglang/srt/models/gpt_oss.py +5 -4
- sglang/srt/models/longcat_flash.py +26 -15
- sglang/srt/models/longcat_flash_nextn.py +23 -15
- sglang/srt/utils.py +0 -10
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/version.py +1 -1
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/METADATA +2 -2
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/RECORD +32 -29
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc0.dist-info → sglang-0.5.2rc1.dist-info}/top_level.txt +0 -0
sglang/srt/configs/model_config.py
CHANGED

@@ -405,9 +405,10 @@ class ModelConfig:
         # compressed-tensors uses a "compression_config" key
         quant_cfg = getattr(self.hf_config, "compression_config", None)
         if quant_cfg is None:
-            # check if is modelopt model --
+            # check if is modelopt or mixed-precision model -- Both of them don't have corresponding field
             # in hf `config.json` but has a standalone `hf_quant_config.json` in the root directory
             # example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
+            # example: https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/tree/main
             is_local = os.path.exists(self.model_path)
             modelopt_quant_config = {"quant_method": "modelopt"}
             if not is_local:
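The new comments describe checkpoints (ModelOpt exports and mixed-precision repos such as the DeepSeek-R1-W4AFP8 example) whose quantization settings live in a standalone `hf_quant_config.json` rather than in `config.json`. A minimal sketch of that fallback for a local checkout; the helper name and key handling here are illustrative, not SGLang's actual logic:

```python
import json
import os
from typing import Any, Dict, Optional


def load_standalone_quant_config(model_path: str) -> Optional[Dict[str, Any]]:
    """Illustrative fallback: read hf_quant_config.json from the model root
    when config.json carries no quantization section (local checkout only)."""
    candidate = os.path.join(model_path, "hf_quant_config.json")
    if not os.path.exists(candidate):
        return None
    with open(candidate) as f:
        return json.load(f)
```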
sglang/srt/distributed/parallel_state.py
CHANGED

@@ -43,6 +43,7 @@ from sglang.srt.utils import (
     direct_register_custom_op,
     get_bool_env_var,
     get_int_env_var,
+    is_cpu,
     is_cuda_alike,
     is_hip,
     is_npu,
@@ -51,6 +52,7 @@ from sglang.srt.utils import (
 )
 
 _is_npu = is_npu()
+_is_cpu = is_cpu()
 
 IS_ONE_DEVICE_PER_PROCESS = get_bool_env_var("SGLANG_ONE_DEVICE_PER_PROCESS")
 
@@ -1643,7 +1645,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
 
         ray.shutdown()
     gc.collect()
-    if not
+    if not _is_cpu:
         if hasattr(torch, "cuda") and torch.cuda.is_available():
             torch.cuda.empty_cache()
             if hasattr(torch._C, "_host_emptyCache"):
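Taken together, the three hunks above import `is_cpu`, cache it in a module-level `_is_cpu` flag, and use it to skip the CUDA allocator cleanup on CPU-only builds. A minimal standalone sketch of that guard (an illustrative helper, not the actual `cleanup_dist_env_and_memory`):

```python
import gc

import torch

from sglang.srt.utils import is_cpu

_is_cpu = is_cpu()


def release_cached_memory() -> None:
    # Illustrative helper mirroring the guard in the hunk above.
    gc.collect()
    if not _is_cpu:
        # Only touch the CUDA allocator when a GPU backend is present.
        if hasattr(torch, "cuda") and torch.cuda.is_available():
            torch.cuda.empty_cache()
```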
sglang/srt/entrypoints/engine.py
CHANGED
@@ -681,7 +681,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.
+            "0.3.8",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
sglang/srt/layers/moe/cutlass_w4a8_moe.py
CHANGED

@@ -91,18 +91,10 @@ def cutlass_w4a8_moe(
     assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
     assert w1_q.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch"
     assert w1_q.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch"
-    assert (
-        w1_scale.shape[1] == w1_q.shape[2] * 2 / 512
-        and w1_scale.shape[2] == w1_q.shape[1] * 4
-    ), "W1 scale shape mismatch"
-    assert (
-        w2_scale.shape[1] == w2_q.shape[2] * 2 / 512
-        and w2_scale.shape[2] == w2_q.shape[1] * 4
-    ), "W2 scale shape mismatch"
 
     assert a_strides1.shape[0] == w1_q.shape[0], "A Strides 1 expert number mismatch"
     assert b_strides1.shape[0] == w1_q.shape[0], "B Strides 1 expert number mismatch"
-    assert a_strides2.shape[0] == w2_q.shape[0], "A Strides 2 expert number
+    assert a_strides2.shape[0] == w2_q.shape[0], "A Strides 2 expert number mismatch"
     assert b_strides2.shape[0] == w2_q.shape[0], "B Strides 2 expert number mismatch"
     num_experts = w1_q.size(0)
     m = a.size(0)
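The deleted assertions tied each scale tensor's shape to the packed int4 weight shape: `scale.shape[1] == w_q.shape[2] * 2 / 512` and `scale.shape[2] == w_q.shape[1] * 4`. A small worked example of that relation, with made-up sizes purely to make the arithmetic concrete:

```python
# Hypothetical sizes, chosen only to illustrate the removed shape checks.
E, N, K_packed = 8, 512, 3584            # w1_q: (num_experts, N, K // 2), int4-packed

scale_dim1 = K_packed * 2 // 512         # 3584 * 2 // 512 = 14 scale groups along K
scale_dim2 = N * 4                       # 512 * 4 = 2048

expected_w1_scale_shape = (E, scale_dim1, scale_dim2)
print(expected_w1_scale_shape)           # (8, 14, 2048)
```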
sglang/srt/layers/moe/ep_moe/layer.py
CHANGED

@@ -114,9 +114,6 @@ class EPMoE(FusedMoE):
             with_bias=with_bias,
         )
 
-        self.start_expert_id = self.moe_ep_rank * self.num_local_experts
-        self.end_expert_id = self.start_expert_id + self.num_local_experts - 1
-
         self.intermediate_size = intermediate_size
 
         if isinstance(quant_config, Fp8Config):
@@ -232,7 +229,7 @@ class EPMoE(FusedMoE):
             (
                 _cast_to_e8m0_with_rounding_up(gateup_input_scale)
                 if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
-                else deep_gemm_wrapper.
+                else deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(
                     gateup_input_scale
                 )
             ),
@@ -289,9 +286,7 @@ class EPMoE(FusedMoE):
             (
                 down_input_scale
                 if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
-                else deep_gemm_wrapper.
-                    down_input_scale
-                )
+                else deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(down_input_scale)
             ),
         )
         down_output = torch.empty(
sglang/srt/layers/moe/fused_moe_triton/__init__.py
CHANGED

@@ -1,16 +1,18 @@
 from contextlib import contextmanager
 from typing import Any, Dict, Optional
 
-from sglang.srt.layers.moe.fused_moe_triton.fused_moe import
-
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe_triton_config import (
     get_config_file_name,
-    moe_align_block_size,
     try_get_optimal_moe_config,
 )
 from sglang.srt.layers.moe.fused_moe_triton.layer import (
     FusedMoE,
     FusedMoeWeightScaleSupported,
 )
+from sglang.srt.layers.moe.fused_moe_triton.moe_align_block_size import (
+    moe_align_block_size,
+)
 
 _config: Optional[Dict[str, Any]] = None
 
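Because `__init__.py` re-imports the relocated helpers, package-level imports keep resolving even though `fused_moe.py` was split into the new `fused_moe_triton_config`, `fused_moe_triton_kernels`, and `moe_align_block_size` modules. A quick sanity check using only the names visible in the diff:

```python
# Package-level imports still resolve after the split, because __init__.py
# re-imports these names from their new submodules.
from sglang.srt.layers.moe.fused_moe_triton import (
    FusedMoE,
    fused_experts,
    get_config_file_name,
    moe_align_block_size,
    try_get_optimal_moe_config,
)
```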