sglang 0.4.4.post4__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/lang/chat_template.py +24 -0
- sglang/srt/configs/model_config.py +4 -0
- sglang/srt/conversation.py +29 -4
- sglang/srt/layers/attention/flashattention_backend.py +286 -9
- sglang/srt/layers/moe/fused_moe_native.py +5 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +13 -3
- sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- sglang/srt/layers/quantization/__init__.py +1 -0
- sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- sglang/srt/layers/quantization/fp8.py +3 -1
- sglang/srt/layers/quantization/moe_wna16.py +2 -0
- sglang/srt/layers/quantization/w8a8_int8.py +2 -0
- sglang/srt/layers/radix_attention.py +2 -0
- sglang/srt/layers/rotary_embedding.py +63 -0
- sglang/srt/managers/multimodal_processors/mllama4.py +161 -0
- sglang/srt/model_executor/model_runner.py +1 -0
- sglang/srt/models/llama.py +12 -4
- sglang/srt/models/llama4.py +420 -0
- sglang/srt/models/mllama4.py +154 -0
- sglang/version.py +1 -1
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.dist-info}/METADATA +1 -1
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.dist-info}/RECORD +32 -22
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.dist-info}/WHEEL +0 -0
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.4.post4.dist-info → sglang-0.4.5.dist-info}/top_level.txt +0 -0
sglang/srt/models/mllama4.py
ADDED
@@ -0,0 +1,154 @@
+# TODO: add Aapted from vllm/mllama4.py
+from collections.abc import Iterable
+from typing import Optional, Set, Tuple
+
+import torch
+from torch import nn
+from transformers import Llama4Config
+
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class Llama4ForConditionalGeneration(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    }
+
+    def __init__(
+        self,
+        config: Llama4Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+
+        # Initialize the language model
+        from sglang.srt.models.llama4 import Llama4ForCausalLM
+
+        self.language_model = Llama4ForCausalLM(
+            config.text_config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+
+        self.logits_processor = LogitsProcessor(config.text_config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: object,
+    ) -> torch.Tensor:
+
+        return self.language_model(input_ids, positions, forward_batch)
+
+    def permute_qk_weight_for_rotary(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+    ) -> Tuple[str, torch.Tensor]:
+
+        def permute(w: torch.Tensor, n_heads: int):
+            attn_in = self.language_model.config.head_dim * n_heads
+            attn_out = self.language_model.config.hidden_size
+
+            return (
+                w.view(n_heads, attn_in // n_heads // 2, 2, attn_out)
+                .transpose(1, 2)
+                .reshape(attn_in, attn_out)
+            )
+
+        modules = name.split(".")
+
+        # rotary embeds should be sliced
+        if ("wk" in modules or "k_proj" in modules) and modules[-1] == "weight":
+            loaded_weight = permute(
+                loaded_weight, self.language_model.config.num_key_value_heads
+            )
+        elif ("wq" in modules or "q_proj" in modules) and modules[-1] == "weight":
+            loaded_weight = permute(
+                loaded_weight, self.language_model.config.num_attention_heads
+            )
+
+        return name, loaded_weight
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
+            (".self_attn.qkv_proj", ".self_attn.k_proj", "k"),
+            (".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
+            (".shared_expert.gate_up_proj", ".shared_expert.gate_proj", 0),
+            (".shared_expert.gate_up_proj", ".shared_expert.up_proj", 1),
+            (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0),
+            (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+
+        num_experts = self.config.text_config.num_local_experts
+
+        for name, loaded_weight in weights:
+
+            if name.startswith("vision_model") or name.startswith(
+                "multi_modal_projector"
+            ):
+                continue
+
+            name, loaded_weight = self.permute_qk_weight_for_rotary(name, loaded_weight)
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if ".experts" in name:
+                    if ".gate_up_proj" in name:
+                        name_list = [
+                            name.replace(".experts.gate_up_proj", ".experts.w13_weight")
+                        ] * 2
+                        loaded_weight_list = loaded_weight.chunk(2, dim=-1)
+                        shard_id_list = ["w1", "w3"]
+                    else:
+                        name_list = [
+                            name.replace(".experts.down_proj", ".experts.w2_weight")
+                        ]
+                        shard_id_list = ["w2"]
+                        loaded_weight_list = [loaded_weight]
+                    for name, loaded_weight, shard_id in zip(
+                        name_list, loaded_weight_list, shard_id_list
+                    ):
+                        param = params_dict[name]
+                        weight_loader = param.weight_loader
+                        for expert_id in range(num_experts):
+                            weight_loader(
+                                param,
+                                loaded_weight[expert_id].T,
+                                name,
+                                shard_id=shard_id,
+                                expert_id=expert_id,
+                            )
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+EntryClass = Llama4ForConditionalGeneration
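A note on the two less obvious transforms in this new file. The inner permute() in permute_qk_weight_for_rotary regroups each attention head's rows of a q/k projection weight from the checkpoint's interleaved rotary layout to the half-split layout used at runtime, and load_weights chunks a fused experts.gate_up_proj tensor into its gate ("w1") and up ("w3") halves. A minimal standalone sketch of both, with made-up shapes (not part of the package):

# Hedged sketch of the two weight transforms above; all shapes are
# hypothetical and chosen small so the example runs anywhere.
import torch

# 1) permute_qk_weight_for_rotary: per head, reorder rows from the
#    interleaved rotary layout [x0, y0, x1, y1, ...] to the half-split
#    layout [x0, x1, ..., y0, y1, ...].
n_heads, head_dim, hidden_size = 2, 4, 8
attn_in, attn_out = n_heads * head_dim, hidden_size
w = torch.arange(attn_in * attn_out, dtype=torch.float32).view(attn_in, attn_out)
w_permuted = (
    w.view(n_heads, attn_in // n_heads // 2, 2, attn_out)
    .transpose(1, 2)
    .reshape(attn_in, attn_out)
)
assert w_permuted.shape == w.shape  # same shape, rows reordered within each head

# 2) load_weights: a fused experts.gate_up_proj tensor is chunked on its
#    last dim into the "w1" (gate) and "w3" (up) halves; each expert's
#    slice is transposed before being handed to the fused-MoE weight loader.
num_experts, intermediate = 4, 6
gate_up = torch.randn(num_experts, hidden_size, 2 * intermediate)
w1, w3 = gate_up.chunk(2, dim=-1)
print(w1[0].T.shape, w3[0].T.shape)  # torch.Size([6, 8]) each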
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.4.post4"
+__version__ = "0.4.5"
{sglang-0.4.4.post4.dist-info → sglang-0.4.5.dist-info}/RECORD
CHANGED
@@ -9,9 +9,9 @@ sglang/global_config.py,sha256=xzLdk8W53fneFblNh8iIjGF9C3-7mnzR1-LleD9Btxg,1495
 sglang/launch_server.py,sha256=mDXfwha8LHpWQJekcCosR98QhCQsbmilsBlI5jAIgg0,420
 sglang/llama3_eval.py,sha256=gWSboDchIGybIce88bJlrCG0yiLZ513mw4gcutJlzGM,10017
 sglang/utils.py,sha256=GIcgiRHkZ-gyPxXOdn1qFF41jkg4-YdDxbPc4mzO-qk,16159
-sglang/version.py,sha256=
+sglang/version.py,sha256=ErkLkI2TDBX1OIqi2GGa20CPeu4ZculEi-XffRbLU6M,22
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sglang/lang/chat_template.py,sha256=
+sglang/lang/chat_template.py,sha256=MwNL5dNTe8g_l2ljZubnrazEgT2xEv-9O2D0Ezwxy4I,19658
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
 sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
 sglang/lang/interpreter.py,sha256=OH1SFCm4rUCPO32MTo8j5V2Z13Jic7_r1GQOP1-aHaw,33234
@@ -27,7 +27,7 @@ sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bE
 sglang/srt/_custom_ops.py,sha256=lUBwC5R2UfjFMA1EtC5Kh2IngsqBJM9IuMW46kJWcjE,3647
 sglang/srt/aio_rwlock.py,sha256=6LYtOdeTUY3hkfa1dmYkgsaF2ttrwIF3hUWz2AZ2fqw,2970
 sglang/srt/code_completion_parser.py,sha256=HhEUzdL-FVBsOot9tKDKA1l8Gdx8qsF1RRg-zHNpmLQ,5400
-sglang/srt/conversation.py,sha256=
+sglang/srt/conversation.py,sha256=WP72AZrZpiqc5RowucT2tW3jVCb1pb4veW_kpwYS4yY,28785
 sglang/srt/custom_op.py,sha256=bIZ__3FiZvkbsN9O_jeLy_49X7ZbYbw0VxoL80uWwaI,3715
 sglang/srt/function_call_parser.py,sha256=buYENeNEP5bhsvD424yGCa9wOqSfVOZSRn6zLiSJp5I,23733
 sglang/srt/hf_transformers_utils.py,sha256=_QYTl9LpU0jmKPlYooHi1etwMvb5v40JIrG_t_Fx06w,9215
@@ -48,7 +48,7 @@ sglang/srt/configs/device_config.py,sha256=kfmpPOECqYxcRoY-ko0QZRhyiBWUGP2CMF51D
 sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
 sglang/srt/configs/janus_pro.py,sha256=-QtJ4ZGZiAJb0AkOEcuCHzIKLw23nF8nRk3rdCcoUO0,19016
 sglang/srt/configs/load_config.py,sha256=qs-AxuplouBx2tsv9KGBOLZPbwzuVA4vbktbGP_cRp8,3309
-sglang/srt/configs/model_config.py,sha256
+sglang/srt/configs/model_config.py,sha256=ZioUnc5UzsBVEYHE_GgCofYL97MByZm2NfHikS9HwLo,20771
 sglang/srt/configs/utils.py,sha256=3nHUfisMs_Ltuhv8OZTNCJp63YJKJVF43h1QZB1zqx8,670
 sglang/srt/connector/__init__.py,sha256=czLX5JOxuMhH-T9eSJzoc1qv1B4z9chyffDRL5I6wo4,1247
 sglang/srt/connector/base_connector.py,sha256=i6i1TIzsz4NbSEkrdMPq-urb2sN2aLAx8dazga4gB9U,2833
@@ -91,14 +91,14 @@ sglang/srt/layers/linear.py,sha256=HYIGxpRYL6x-jNOkyNtGAw5Ak9Nq8jkntddgTBER_1w,5
 sglang/srt/layers/logits_processor.py,sha256=Vp8ibljVEezTr54xzeOcjiJR7JdYO8ItkO5nLIIMVu0,24206
 sglang/srt/layers/parameter.py,sha256=0OTMtmsNds42e3z3wHTRJiUfxCWFwSL6DHrqgeTgGt8,15151
 sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
-sglang/srt/layers/radix_attention.py,sha256=
-sglang/srt/layers/rotary_embedding.py,sha256=
+sglang/srt/layers/radix_attention.py,sha256=4xRq0w9yDfAVdNlBToQpmc7irq-pomJm-GlIfMtpYtk,2328
+sglang/srt/layers/rotary_embedding.py,sha256=1nznPQ6EfVMDSRabKqifEE2xtMFwSri-kEepMaWdZeI,45340
 sglang/srt/layers/sampler.py,sha256=yipSyN5UWGwGS-BC-WzWMmelys4CCDtK_8b1OpaK6sM,11622
 sglang/srt/layers/torchao_utils.py,sha256=Ws24FdRBSkTpyeyA6bQrdDm-W5wfDxKvSIPUSahyMfA,4063
 sglang/srt/layers/vocab_parallel_embedding.py,sha256=QUxd4sELx6p3dHvEKmccPZ-phdd_9EjNdwjH3SJ9zxI,22238
 sglang/srt/layers/attention/base_attn_backend.py,sha256=X_GIbQuU9njtUEGdUP7E_KRhmGxj3UyPHNESlL3QaQ8,3264
 sglang/srt/layers/attention/double_sparsity_backend.py,sha256=2ZRL_gYz14idoVqQzeQ6N77nXer0f_8_TUYw40XUUz0,9161
-sglang/srt/layers/attention/flashattention_backend.py,sha256=
+sglang/srt/layers/attention/flashattention_backend.py,sha256=ORtcSJUDbV2qfKGkq9ohiy8JJ1SU9R2I5fSMizF4EhI,42572
 sglang/srt/layers/attention/flashinfer_backend.py,sha256=3fxS2NQzCBw7h_gLxBjHcyDkf2quWqBxr_N01lYmfJo,45865
 sglang/srt/layers/attention/flashinfer_mla_backend.py,sha256=pnVhvVEK87iFW8gUb1G7X7c1tqro8R2DSEOFCnlV8Bo,30301
 sglang/srt/layers/attention/flashmla_backend.py,sha256=1RPFNtQOBw6BWxIjrzfJgA9Nx92udLbR-S5KXmqjxS8,10536
@@ -111,7 +111,7 @@ sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=BXUY8
 sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=hbBvnhc2zqu-E3HNROVXyNOZbtDkVRuFus-yTjmE0Sg,13668
 sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=Y66gZ37u0GKMPtI8n5MbO6uOxRuGEmKIG0IPbJTOqAM,6213
 sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py,sha256=664WnAJ91EiCUZOcnVDfbTQf4uGJ4ZDZB1CbxpEUFZc,13866
-sglang/srt/layers/moe/fused_moe_native.py,sha256=
+sglang/srt/layers/moe/fused_moe_native.py,sha256=bf0po921lY9xnlZivdJly0bGIYFlLqp5v8Mz7tG5bdg,4451
 sglang/srt/layers/moe/router.py,sha256=gvyK7hXlujfCZCmAIFc3oxfgjuAjzlpPe3mp1Blc6Y0,10419
 sglang/srt/layers/moe/topk.py,sha256=iUb-64CaNAUfvBZ1pkgsedcLRQs2sVSIzQ5300WmdXI,10242
 sglang/srt/layers/moe/ep_moe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -119,8 +119,8 @@ sglang/srt/layers/moe/ep_moe/kernels.py,sha256=ijqRzS-tb0LGnDU5hW-g0JH104ppADrWa
 sglang/srt/layers/moe/ep_moe/layer.py,sha256=1TmWnxv-bW1Qbgru-V-vGnt3ruuTIwHQy0Y5ZA_xzvE,36824
 sglang/srt/layers/moe/ep_moe/token_dispatcher.py,sha256=jnr6KSM8YooftTjZ3gYe0eWpOd1dmkXqk4hKRvLTwCo,19708
 sglang/srt/layers/moe/fused_moe_triton/__init__.py,sha256=h9yMFAL_bagUf-qBED8gSWdCOb7d8IdA-pE-L_nIg8E,842
-sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=
-sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=
+sglang/srt/layers/moe/fused_moe_triton/fused_moe.py,sha256=sjAXnjUmLXPpvFFL4VShBce_9xygWY2twAQJ74OJ_ZQ,54500
+sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=43-UL9KEMoaiC0cRSzWFbg2PADtcoxfZqjZ6TOvQ7Vk,24551
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json",sha256=iNGsE2ZeVnQEnN4A8UJ9Jv0d3hbRF2MJ9oBgjup5Szk,2737
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=JJN0hryyLr5Zv3dSS7C8cPFhAwTT6XxUVnBGMZvV6JA,2752
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json",sha256=ouRyZ5PEMPP2njPftCNhs-1g1y6wueWLmhI7G1SjV1k,4131
@@ -132,6 +132,10 @@ sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=CYT3ujh5ifonhqQc1uYSa6maJ
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=yf33YmWlVSjjyg0Q4OMAWvc9gjRxvttMrQBUEOfPl4I,4153
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json",sha256=ZWMClYN1moVRUP2f0hYac38di_pUgZggyl9d2D5rnoc,4136
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=C65Q2Mv1LxFQ_qDnv11IZ9nwl7sGZo72nWDflMttu4g,4147
+"sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=FsWbV4Q6AzAtgegVuENBDz2ZcSJsqNiwUIVfQbpP7hQ,3244
+"sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=pk6VZChh2Y0CsJSzjtUhOnlta1QLTUEWy33aKQU47XY,3244
+"sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=Gmk24hc5lVIfQtqSa5wLOcWKedMN8aZUe93DBh6J1AY,3249
+"sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json",sha256=uY_XMPomaXMXxIkTR4ctU_Ybri_jMv2VvCcV-f6O_bw,3255
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json",sha256=pCCKkdUzzuBVtljyk7AEIAbeDf12DUiieXaODZXzm5E,3254
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=trX2-c4N6hTTD6zFNi6A2bT3FkhxKjkM2rPl-o1K9ss,3250
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=I4d56uD7E1JMXD9RAxq3FebdPquDsnNEkVaIY9Ctm9w,3246
@@ -139,6 +143,7 @@ sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=CYT3ujh5ifonhqQc1uYSa6maJ
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=0ejgnIQ_mzJhKjQpnT-I1Vj9-rPfGlTcQ8u0cXgekUw,2746
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json",sha256=lU4fjngJmpzOafi-3_q0vj2pLfZQVVagFnZNoI97etk,4128
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=ilUf76TmbtaL3W0bara6JIBEiV_iPIs29UjLbH_GYtc,4145
+"sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=4ye1VakIIzrGUTxGNb_DM5v59djLgok8SeBeU_YI6Go,3252
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json",sha256=G4PKqWxh0MlBhg7QHKj0m--_fP3Ll0gs7VJaeg-NIDM,3254
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=bKX9AvcxN6k-i3RUmHSchZZ3rjoYRYb4iBqhCI4L3MY,3257
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json",sha256=OeJUCPVBD1z9CSooZpy3hRyAasjKqFAQaTLcWK6PWno,2741
@@ -152,6 +157,8 @@ sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=CYT3ujh5ifonhqQc1uYSa6maJ
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json",sha256=kklgf2qLI5CQYiJJ5e9Gxx2gAfGxcyMDYpdJnIXPV8E,2748
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json",sha256=8e0tN_DHPwvh_HECVHx9oOF_4WWdaht4s6Nmd_K-aBU,2904
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json",sha256=fRzKfVFIcnxqu6DvGJQNltuFRRGz8F-eaL73bIzBzo8,3255
+"sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=zq1x3KwZ3YuzE42OPPsElXT_VS9sK135CdB1RsotLU4,3252
+"sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json",sha256=lJbEa7VsBVKL_iRoHniB-2nceMrPctkqtjKszPnX4pk,3248
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json",sha256=RuUDK9XfgXs1eZESWQR9ba4tu-rCRG_UCYwjaJ568sI,3264
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json",sha256=wlCi9aoYp7Zc1GThEutvWDbse0kKnNaQgFJsd_L8be0,3259
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json",sha256=tPYxeo_xUOkjQrZMdf9v4IaFrw0RGaZNLGLJPOhjE_g,3260
@@ -243,11 +250,11 @@ sglang/srt/layers/moe/fused_moe_triton/layer.py,sha256=CYT3ujh5ifonhqQc1uYSa6maJ
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json",sha256=-RzUWSIAAsg6iA-8SPMa68hPpBVoUyMJs3dLP7edRu0,4323
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json",sha256=sY2nWMPh9lsIkhPCjkHO245wpnfFbrHmzdcZDVFPVww,3265
 "sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json",sha256=Uz5X80VcNBOaxshwVNUEittHk2zqB4HQCfTJ4TPG5aM,3274
-sglang/srt/layers/quantization/__init__.py,sha256=
+sglang/srt/layers/quantization/__init__.py,sha256=yokDLpqQZ6eIIeaBZggJG-oS4h3TmroXZHLL40YykeM,12159
 sglang/srt/layers/quantization/awq.py,sha256=VImnVCU_QBLFba6S88T0dJ-vLy6SMm3OLIMEdllDfVI,6663
 sglang/srt/layers/quantization/base_config.py,sha256=jWk_egQrVNMYmQgbTI9vkcgzScLFjB5_sywFlAfE5J0,4776
-sglang/srt/layers/quantization/blockwise_int8.py,sha256=
-sglang/srt/layers/quantization/fp8.py,sha256=
+sglang/srt/layers/quantization/blockwise_int8.py,sha256=yE8ARplbha1sW1Szl-mgsRDzGTRpEZY_zAKkCJIu680,15010
+sglang/srt/layers/quantization/fp8.py,sha256=J5D_KdRYiOQ4NCbjoKfYDHdIgCGMy-tQwHlTiG44pJc,41189
 sglang/srt/layers/quantization/fp8_kernel.py,sha256=JRalHJ-btDpzl3oXu2R_ZoJBu5TzBBmW_wKZDFs-usQ,24384
 sglang/srt/layers/quantization/fp8_utils.py,sha256=CDR2fLrZa_mZ86n5S2dDjYMpVCGa2n7gCXd2BYZjXcM,21391
 sglang/srt/layers/quantization/gptq.py,sha256=e4rMz374-yQQqeAI77WPxfcAaRk38GeN2akEpvnC_Do,15141
@@ -255,10 +262,10 @@ sglang/srt/layers/quantization/int8_kernel.py,sha256=GfRn_imIw8kNgqdtb2lr7Bettjg
 sglang/srt/layers/quantization/int8_utils.py,sha256=YK9CS-lb_n91kNCTKK5o5apYF31V2giDg5G5VKrpcUA,2356
 sglang/srt/layers/quantization/kv_cache.py,sha256=rJi6amyLZsquUMo_V5iLlPMqdsGTLgxh4popN1xUHCQ,4236
 sglang/srt/layers/quantization/modelopt_quant.py,sha256=mne4uKF0R-K0OvWN7X5ZxD4LdXKBc6GvmpZzIW6gkmM,6969
-sglang/srt/layers/quantization/moe_wna16.py,sha256=
+sglang/srt/layers/quantization/moe_wna16.py,sha256=3Z8Eq4_ehTN5EEotlYC09FpUNmF8VO8uv7QzUqJa0QI,19371
 sglang/srt/layers/quantization/utils.py,sha256=QqGFwRnFenOm5HfyLoS4D06_LyvNWgOggAiFtZXTpQ4,5637
 sglang/srt/layers/quantization/w8a8_fp8.py,sha256=XcQdgqXA3eKbAf-4_0I81Y5Nvjns3bQTocovnN8141w,6234
-sglang/srt/layers/quantization/w8a8_int8.py,sha256=
+sglang/srt/layers/quantization/w8a8_int8.py,sha256=oLURfgMpsES8qLf0CIJ-4rfQgBGf452Lo0U6tvq6jH0,8856
 sglang/srt/layers/quantization/compressed_tensors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py,sha256=ngKVSHfQUNSZzrLMu4Iv_4Fzt2eOoOIZKcO2RNDiwAM,25353
 sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py,sha256=roqRrIJybA9YuN3kqSeoLTJhXfTHOOtJd5MkenpOL8E,25835
@@ -458,6 +465,7 @@ sglang/srt/managers/multimodal_processors/janus_pro.py,sha256=wZs4HZhPov7yvV2VU2
 sglang/srt/managers/multimodal_processors/llava.py,sha256=8mac3vUUpVd12o43k1TyMaLEySZB915ks8Q5epeZmbg,6209
 sglang/srt/managers/multimodal_processors/minicpm.py,sha256=Mq-iH2j90VrGAbSaF3ayYWhTEm9RvWNI6ZhBb6G23dQ,5684
 sglang/srt/managers/multimodal_processors/mlama.py,sha256=MLiGS606LzVtdoXvjWGANx-K_7nE9J_fMVmkXN7Gz8k,1661
+sglang/srt/managers/multimodal_processors/mllama4.py,sha256=K6OKhSZOoaHwrRt0ZVi3gi2vnzMVHWJb5n3fUoStwIs,6188
 sglang/srt/managers/multimodal_processors/qwen_vl.py,sha256=67EmFiAkvZncU-eqiiS0Q4dr3pWcfI-RofYiQnNWvu0,5722
 sglang/srt/mem_cache/base_prefix_cache.py,sha256=NY62Zo0A0tLJ7ObRLOQqQcXCxoJUDZsK8f5U4dNQjKc,973
 sglang/srt/mem_cache/chunk_cache.py,sha256=it5SfL1FwMbrdeOH-I-Eu_i-I9hFB1xL-z_brIUoCkk,1835
@@ -470,7 +478,7 @@ sglang/srt/metrics/collector.py,sha256=aCxHqgsQ6P8ZxsAvq_MoEVsr3KUvIUSOBpGYMgBxm
 sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
 sglang/srt/model_executor/cuda_graph_runner.py,sha256=bDLOqlxdwRUyKitG8JyZygnm05N00q-TdNiAayG_T8o,23223
 sglang/srt/model_executor/forward_batch_info.py,sha256=8VI1VxSmyH26lIHnCNeGqYw2XxslbqN_cuSUIEPUtRU,19468
-sglang/srt/model_executor/model_runner.py,sha256=
+sglang/srt/model_executor/model_runner.py,sha256=4Xi-1u1tTC34uK_DtYEaj7VtvPjDDgMzRaXeJ5kpsQE,45076
 sglang/srt/model_loader/__init__.py,sha256=zGZkOBz1zx-pkaIy47BasL3fjDlAcxAXUTjInOhXHAE,919
 sglang/srt/model_loader/loader.py,sha256=AUS4SqSFghbQjs29C65lg7_zxR9h1t7N5G0gERjc0Rc,54238
 sglang/srt/model_loader/utils.py,sha256=0NaMR67fESFopaklmsleiL27XH1QUrjZW246MUu1EJ0,1369
@@ -497,7 +505,8 @@ sglang/srt/models/granite.py,sha256=nu_Zl_PYn188gk1uYVZ76y4wwHZV7G0w7uanhqpSFUs,
 sglang/srt/models/grok.py,sha256=pQOXtpHOYVntwt5QQRLffYsnMHmMfPMmGyKMfR0k0Ic,27994
 sglang/srt/models/internlm2.py,sha256=4eh9WVgK4yg13IsnH5qB2xUCWnixj_aLLz7qa_4m2_Q,13017
 sglang/srt/models/internlm2_reward.py,sha256=ndfGmyqYZbVZ7C7rJ-v9oK3wa-EpoBGybS8MlyKZi2E,2522
-sglang/srt/models/llama.py,sha256=
+sglang/srt/models/llama.py,sha256=gcl2YtnM54J_fZQx2Z26LMm7vPbWN7N1CjzlaBEA3zk,24893
+sglang/srt/models/llama4.py,sha256=4WqHX6YPBrlJVA7HoQTMUfdoU_mEhpWSgoFaeKdhdCE,15018
 sglang/srt/models/llama_classification.py,sha256=4QWTFaUZIFKYZvEzs8bx8VkOZNIwdYCLrnwrdAw4QK0,3108
 sglang/srt/models/llama_eagle.py,sha256=OB2lKsjn7BcfCZljklnhk83me8j0PuQmYLou7baNcq4,4866
 sglang/srt/models/llama_eagle3.py,sha256=v3bftBVDIGjnzngQYnu19cy0J_3w7yruHqLP5nsAQDM,6642
@@ -513,6 +522,7 @@ sglang/srt/models/mistral.py,sha256=EYifJUUzN2Z2-iL37eJiNZF_DB0H4pa0mKlgYRIxM70,
 sglang/srt/models/mixtral.py,sha256=6Fse2J-20IMylP-yzpEihIinaH37TmmslATbLcWBRYY,14926
 sglang/srt/models/mixtral_quant.py,sha256=MSa6UKPbgv8Rn8Iv8o1dQhcstAHLNQzE0eepFx_hYSw,15221
 sglang/srt/models/mllama.py,sha256=SsK_cEolaeoXh_HkyXsSF2ueYR3sPv1NvnGH2k6Aqx0,38461
+sglang/srt/models/mllama4.py,sha256=E2mCxJ1zCt6Io4LL4Rtt5uqMj7Jy971234ZcuyJZxSo,5800
 sglang/srt/models/olmo.py,sha256=FJk8A3T3TF5QcTV6rMP8np94QtvxpMWlgCsv_5VwpVE,12632
 sglang/srt/models/olmo2.py,sha256=U0ScFzWazOrb_Q90sfXkpVNAsXT-pgZbNgGh80R40VE,14288
 sglang/srt/models/olmoe.py,sha256=tx5OKWLOr6_pohe2eBcIodCmcuSjtpteHq_tG_QVYCY,15910
@@ -569,8 +579,8 @@ sglang/test/test_programs.py,sha256=VZ3vXtUDBnXz0M7gFdDH8hXg9Wa0j_qI8CVqjEgRN_E,
 sglang/test/test_utils.py,sha256=jUkIDxJ7I8hCPk0XF7F_IWJkOtn6O7eXJG5pI0cduwo,30463
 sglang/test/attention/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/test/attention/test_flashattn_backend.py,sha256=OxS1KsPs19nwZcDtdURj7_liT1cIfEXb6W4FH9KMaaE,10808
-sglang-0.4.
-sglang-0.4.
-sglang-0.4.
-sglang-0.4.
-sglang-0.4.
+sglang-0.4.5.dist-info/licenses/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
+sglang-0.4.5.dist-info/METADATA,sha256=dFvXPJ-aE-juLKgxD5l8wflGgO1cHg2jHjScLX_Ftjw,25061
+sglang-0.4.5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+sglang-0.4.5.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.4.5.dist-info/RECORD,,
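For anyone spot-checking the RECORD entries above by hand: each line is path,sha256=<digest>,<size>, where the digest is the urlsafe base64 encoding of the file's SHA-256 hash with trailing "=" padding stripped (the wheel RECORD convention, PEP 376/PEP 427). A small sketch; the file content shown is an assumption, chosen because it matches the 22-byte size recorded for sglang/version.py:

# Sketch: recompute a wheel RECORD entry from file bytes.
import base64
import hashlib

data = b'__version__ = "0.4.5"\n'  # assumed content; 22 bytes matches the RECORD size field
digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
print(f"sglang/version.py,sha256={digest.decode()},{len(data)}")

If the assumed content is byte-for-byte what the wheel ships, the printed line should match the "+sglang/version.py" entry above.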
{sglang-0.4.4.post4.dist-info → sglang-0.4.5.dist-info}/WHEEL
File without changes
{sglang-0.4.4.post4.dist-info → sglang-0.4.5.dist-info}/licenses/LICENSE
File without changes
{sglang-0.4.4.post4.dist-info → sglang-0.4.5.dist-info}/top_level.txt
File without changes