sglang 0.3.5.post2__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -2
- sglang/api.py +2 -2
- sglang/bench_latency.py +1 -553
- sglang/bench_offline_throughput.py +48 -20
- sglang/bench_one_batch.py +472 -0
- sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
- sglang/bench_serving.py +125 -6
- sglang/check_env.py +3 -6
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +2 -2
- sglang/srt/configs/model_config.py +13 -14
- sglang/srt/constrained/__init__.py +13 -14
- sglang/srt/constrained/base_grammar_backend.py +13 -15
- sglang/srt/constrained/outlines_backend.py +28 -17
- sglang/srt/constrained/outlines_jump_forward.py +13 -15
- sglang/srt/constrained/xgrammar_backend.py +47 -58
- sglang/srt/conversation.py +13 -15
- sglang/srt/hf_transformers_utils.py +13 -15
- sglang/srt/layers/activation.py +16 -13
- sglang/srt/layers/attention/flashinfer_backend.py +106 -54
- sglang/srt/layers/attention/triton_backend.py +9 -7
- sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang/srt/layers/custom_op_util.py +25 -0
- sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +11 -4
- sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
- sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
- sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- sglang/srt/layers/layernorm.py +17 -15
- sglang/srt/layers/logits_processor.py +23 -25
- sglang/srt/layers/quantization/__init__.py +77 -17
- sglang/srt/layers/radix_attention.py +13 -15
- sglang/srt/layers/rotary_embedding.py +13 -13
- sglang/srt/layers/sampler.py +4 -8
- sglang/srt/layers/torchao_utils.py +2 -0
- sglang/srt/lora/lora.py +13 -14
- sglang/srt/lora/lora_config.py +13 -14
- sglang/srt/lora/lora_manager.py +22 -24
- sglang/srt/managers/data_parallel_controller.py +98 -27
- sglang/srt/managers/detokenizer_manager.py +13 -15
- sglang/srt/managers/io_struct.py +63 -21
- sglang/srt/managers/schedule_batch.py +154 -59
- sglang/srt/managers/schedule_policy.py +18 -16
- sglang/srt/managers/scheduler.py +278 -109
- sglang/srt/managers/session_controller.py +61 -0
- sglang/srt/managers/tokenizer_manager.py +63 -18
- sglang/srt/managers/tp_worker.py +25 -16
- sglang/srt/managers/tp_worker_overlap_thread.py +62 -67
- sglang/srt/metrics/collector.py +13 -15
- sglang/srt/metrics/func_timer.py +13 -15
- sglang/srt/mm_utils.py +13 -14
- sglang/srt/model_executor/cuda_graph_runner.py +63 -25
- sglang/srt/model_executor/forward_batch_info.py +128 -32
- sglang/srt/model_executor/model_runner.py +132 -64
- sglang/srt/model_parallel.py +98 -0
- sglang/srt/models/chatglm.py +15 -16
- sglang/srt/models/commandr.py +15 -16
- sglang/srt/models/dbrx.py +15 -16
- sglang/srt/models/deepseek.py +15 -15
- sglang/srt/models/deepseek_v2.py +162 -59
- sglang/srt/models/exaone.py +14 -15
- sglang/srt/models/gemma.py +14 -14
- sglang/srt/models/gemma2.py +31 -25
- sglang/srt/models/gemma2_reward.py +13 -14
- sglang/srt/models/gpt_bigcode.py +14 -14
- sglang/srt/models/grok.py +15 -15
- sglang/srt/models/internlm2.py +13 -15
- sglang/srt/models/internlm2_reward.py +13 -14
- sglang/srt/models/llama.py +21 -21
- sglang/srt/models/llama_classification.py +13 -14
- sglang/srt/models/llama_reward.py +13 -14
- sglang/srt/models/llava.py +14 -16
- sglang/srt/models/llavavid.py +14 -16
- sglang/srt/models/minicpm.py +13 -15
- sglang/srt/models/minicpm3.py +13 -15
- sglang/srt/models/mistral.py +13 -15
- sglang/srt/models/mixtral.py +15 -15
- sglang/srt/models/mixtral_quant.py +14 -14
- sglang/srt/models/olmo.py +22 -20
- sglang/srt/models/olmoe.py +23 -20
- sglang/srt/models/phi3_small.py +447 -0
- sglang/srt/models/qwen.py +14 -14
- sglang/srt/models/qwen2.py +22 -19
- sglang/srt/models/qwen2_moe.py +17 -18
- sglang/srt/models/qwen2_vl.py +13 -6
- sglang/srt/models/stablelm.py +18 -16
- sglang/srt/models/torch_native_llama.py +107 -93
- sglang/srt/models/xverse.py +13 -14
- sglang/srt/models/xverse_moe.py +15 -16
- sglang/srt/models/yivl.py +13 -15
- sglang/srt/openai_api/adapter.py +19 -17
- sglang/srt/openai_api/protocol.py +14 -16
- sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
- sglang/srt/sampling/sampling_batch_info.py +61 -57
- sglang/srt/sampling/sampling_params.py +14 -16
- sglang/srt/server.py +86 -35
- sglang/srt/server_args.py +96 -80
- sglang/srt/utils.py +266 -68
- sglang/test/few_shot_gsm8k.py +8 -4
- sglang/test/runners.py +38 -20
- sglang/test/srt/sampling/penaltylib/utils.py +23 -21
- sglang/test/test_utils.py +31 -20
- sglang/version.py +1 -1
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +66 -57
- sglang-0.3.6.post1.dist-info/RECORD +164 -0
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +1 -1
- sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.5.post2.dist-info/RECORD +0 -156
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0
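The version bump itself lands in `sglang/version.py` (+1 -1 above) and is re-exported from the package root. A minimal post-upgrade sanity check, assuming the new wheel was installed normally (e.g. via pip):

```python
# Hedged sketch: confirm which side of this diff is installed.
# sglang re-exports __version__ from sglang/version.py (the +1 -1 change above).
import sglang

print(sglang.__version__)  # expect "0.3.6.post1" with the new wheel
```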
sglang/srt/models/stablelm.py
CHANGED
@@ -1,22 +1,24 @@
- (13 removed lines not shown in this view)
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================

 # Adapted from:
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/stablelm.py#L1
-"""
-
+"""
+Inference-only StableLM-2 (https://huggingface.co/stabilityai/stablelm-2-1_6b)
+model compatible with HuggingFace weights.
+"""
+
 from typing import Iterable, Optional, Tuple

 import torch
sglang/srt/models/torch_native_llama.py
CHANGED
@@ -1,21 +1,44 @@
- (13 removed lines not shown in this view)
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================

 # Adapted from
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
-"""
+"""
+Inference-only LLaMA model compatible with HuggingFace weights.
+
+This model supports tensor parallelism (TP) using the PyTorch tensor parallel package.
+Reference: https://pytorch.org/docs/stable/distributed.tensor.parallel.html
+
+Here is a quick example to enable TP:
+```python
+from sglang.srt.model_parallel import tensor_parallel
+
+device_mesh = torch.distributed.init_device_mesh("cuda", (tp_size,))
+tensor_parallel(model, device_mesh)
+```
+
+An end-to-end example can be found in `python/sglang/bench_one_batch.py`.
+You can run it with the following command:
+```bash
+$ python3 -m sglang.bench_one_batch --correct \
+  --model meta-llama/Meta-Llama-3-8B \
+  --json-model-override-args '{"architectures": ["TorchNativeLlamaForCausalLM"]}' \
+  --tensor-parallel-size 2 \
+  --disable-cuda-graph
+```
+We will eanble CUDA Graph support soon.
+"""

 import types
 from typing import Any, Dict, Iterable, Optional, Tuple
@@ -24,7 +47,10 @@ import torch
 from torch import nn
 from torch.nn.parameter import Parameter
 from transformers import LlamaConfig
-from vllm.distributed import
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

@@ -41,35 +67,45 @@ from sglang.srt.layers.vocab_parallel_embedding import (
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch

+tp_size = get_tensor_model_parallel_world_size()
+tp_rank = get_tensor_model_parallel_rank()
+

 def gate_up_proj_weight_loader(
     self,
     param: Parameter,
     loaded_weight: torch.Tensor,
-    loaded_shard_id:
+    loaded_shard_id: int,
 ):
- (19 removed lines not shown in this view)
+    # shard_id: (shard_offset, shard_size)
+    gate_up_offsets = {}
+    current_shard_offset = 0
+    for i, output_size in enumerate(self.output_sizes):
+        # Everything shrinks by tp_size if TP enabled
+        output_size = output_size // tp_size
+        gate_up_offsets[i] = (current_shard_offset, output_size)
+        current_shard_offset += output_size
+    # Re-size the param to the size after TP
+    if current_shard_offset != param.shape[0]:
+        # The clone will free the original, full tensor
+        param.data = param.data.narrow(0, 0, current_shard_offset).clone()
+
+    # Now load gate or up
+    assert loaded_shard_id < len(self.output_sizes)
+    param_data = param.data
+    shard_offset, shard_size = gate_up_offsets[loaded_shard_id]
+    param_data = param_data.narrow(0, shard_offset, shard_size)
+    loaded_weight = loaded_weight.narrow(0, tp_rank * shard_size, shard_size)
+    assert param_data.shape == loaded_weight.shape
+    param_data.copy_(loaded_weight)


 class LlamaMLP(nn.Module):
+    _tp_plan = {
+        "gate_up_proj": "Colwise_Sharded",
+        "down_proj": "Rowwise",
+    }
+
     def __init__(
         self,
         hidden_size: int,
@@ -104,62 +140,44 @@ class LlamaMLP(nn.Module):
         return x


-def _get_shard_offset_mapping(self, loaded_shard_id: str):
-    shard_offset_mapping = {
-        "q": 0,
-        "k": self.num_heads * self.head_size,
-        "v": (self.num_heads + self.num_kv_heads) * self.head_size,
-        "total": (self.num_heads + 2 * self.num_kv_heads) * self.head_size,
-    }
-    return shard_offset_mapping.get(loaded_shard_id)
-
-
-def _get_shard_size_mapping(self, loaded_shard_id: str):
-    shard_size_mapping = {
-        "q": self.num_heads * self.head_size,
-        "k": self.num_kv_heads * self.head_size,
-        "v": self.num_kv_heads * self.head_size,
-    }
-    return shard_size_mapping.get(loaded_shard_id)
-
-
 def qkv_proj_weight_loader(
     self,
     param: Parameter,
     loaded_weight: torch.Tensor,
-    loaded_shard_id:
+    loaded_shard_id: str,
 ):
- (24 removed lines not shown in this view)
-    param_data = param_data.narrow(0, shard_offset, shard_size)
-    assert param_data.shape == loaded_weight.shape
-    param_data.copy_(loaded_weight)
-    return
+    num_heads = self.num_heads // tp_size
+    num_kv_heads = self.num_kv_heads // tp_size
+    # shard_id: (shard_offset, shard_size)
+    qkv_offsets = {
+        "q": (0, num_heads * self.head_size),
+        "k": (num_heads * self.head_size, num_kv_heads * self.head_size),
+        "v": (
+            (num_heads + num_kv_heads) * self.head_size,
+            num_kv_heads * self.head_size,
+        ),
+    }
+    total_size = qkv_offsets["v"][0] + qkv_offsets["v"][1]
+    # Re-size the param to the size after TP
+    if total_size != param.shape[0]:
+        # The clone will free the original, full tensor
+        param.data = param.data.narrow(0, 0, total_size).clone()
+
+    # Now load q, k or v
+    shard_offset, shard_size = qkv_offsets[loaded_shard_id]
+    param_data = param.data
+    param_data = param_data.narrow(0, shard_offset, shard_size)
+    loaded_weight = loaded_weight.narrow(0, tp_rank * shard_size, shard_size)
+    assert param_data.shape == loaded_weight.shape
+    param_data.copy_(loaded_weight)


 class LlamaAttention(nn.Module):
+    _tp_plan = {
+        "qkv_proj": "Colwise_Sharded",
+        "o_proj": "Rowwise",
+    }
+
     def __init__(
         self,
         config: LlamaConfig,
@@ -176,7 +194,6 @@ class LlamaAttention(nn.Module):
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
-        tp_size = get_tensor_model_parallel_world_size()
         self.total_num_heads = num_heads
         assert self.total_num_heads % tp_size == 0
         self.num_heads = self.total_num_heads // tp_size
@@ -205,20 +222,12 @@
             (self.total_num_heads + 2 * self.total_num_kv_heads) * self.head_dim,
             bias=False,
         )
-        self.qkv_proj.total_num_heads = self.total_num_heads
         self.qkv_proj.head_size = self.head_dim
-        self.qkv_proj.total_num_kv_heads = self.total_num_kv_heads
         self.qkv_proj.num_heads = self.total_num_heads
         self.qkv_proj.num_kv_heads = self.total_num_kv_heads
         self.qkv_proj.weight_loader = types.MethodType(
             qkv_proj_weight_loader, self.qkv_proj
         )
-        self.qkv_proj._get_shard_offset_mapping = types.MethodType(
-            _get_shard_offset_mapping, self.qkv_proj
-        )
-        self.qkv_proj._get_shard_size_mapping = types.MethodType(
-            _get_shard_size_mapping, self.qkv_proj
-        )
         self.qkv_proj.weight.weight_loader = self.qkv_proj.weight_loader
         self.qkv_proj.weight.output_dim = 0
         self.o_proj = torch.nn.Linear(
@@ -385,10 +394,15 @@ class TorchNativeLlamaForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.torchao_config = global_server_args_dict["torchao_config"]
+        self.supports_torch_tp = True
         self.model = LlamaModel(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)

+        # turning off autotune for fp8dq since it doesn't give speedup and
+        # increases compile time significantly
+        torch._inductor.config.max_autotune_gemm_backends = "ATEN"
+
     @torch.no_grad()
     def forward(
         self,
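Both new weight loaders above follow the same pattern: divide the head or intermediate sizes by the tensor-parallel degree, derive a `(shard_offset, shard_size)` pair per logical shard, and copy the rank-local slice of the unsharded checkpoint weight into the fused parameter. A self-contained sketch of that arithmetic for the QKV case (the helper name and sizes are invented for illustration, not part of sglang; plain PyTorch, no distributed setup required):

```python
import torch


def shard_qkv(full_weight, shard_id, num_heads, num_kv_heads, head_size, tp_size, tp_rank):
    """Return (rank-local slice of a q/k/v weight, offset into the fused per-rank param)."""
    heads = num_heads // tp_size        # query heads per rank
    kv_heads = num_kv_heads // tp_size  # kv heads per rank
    # shard_id -> (offset inside the fused per-rank QKV weight, shard size)
    offsets = {
        "q": (0, heads * head_size),
        "k": (heads * head_size, kv_heads * head_size),
        "v": ((heads + kv_heads) * head_size, kv_heads * head_size),
    }
    shard_offset, shard_size = offsets[shard_id]
    # Each rank copies its own contiguous rows of the full, unsharded weight.
    return full_weight.narrow(0, tp_rank * shard_size, shard_size), shard_offset


# Made-up sizes: 32 query heads, 8 KV heads, head_size 128, hidden 4096, 2-way TP.
w_q = torch.randn(32 * 128, 4096)
rank0_slice, dest_offset = shard_qkv(w_q, "q", 32, 8, 128, tp_size=2, tp_rank=0)
print(rank0_slice.shape, dest_offset)  # torch.Size([2048, 4096]) 0
```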
sglang/srt/models/xverse.py
CHANGED
@@ -1,17 +1,16 @@
- (13 removed lines not shown in this view)
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================

 # Adapted from
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/xverse.py#L1
sglang/srt/models/xverse_moe.py
CHANGED
@@ -1,19 +1,18 @@
- (13 removed lines not shown in this view)
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Inference-only XVERSE MoE model."""
+
 from typing import Any, Dict, Iterable, Optional, Tuple

 import torch
@@ -25,7 +24,6 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -36,6 +34,7 @@ from vllm.model_executor.layers.linear import (
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader

+from sglang.srt.layers.fused_moe_triton import fused_moe
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
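The only functional change in this file is the import: `fused_moe` now comes from the new `sglang/srt/layers/fused_moe_triton` package (added above) rather than from vllm, with the call sites left untouched. For code outside this package that still imports the old location, a guarded import keeps both versions working, assuming the two entry points stay call-compatible as this one-line swap suggests:

```python
# Illustrative compatibility shim, not part of sglang.
try:
    from sglang.srt.layers.fused_moe_triton import fused_moe  # sglang >= 0.3.6
except ImportError:
    from vllm.model_executor.layers.fused_moe import fused_moe  # older stacks
```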
sglang/srt/models/yivl.py
CHANGED
@@ -1,18 +1,16 @@
- (13 removed lines not shown in this view)
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Inference-only Yi-VL model."""

 from typing import Iterable, Optional, Tuple
sglang/srt/openai_api/adapter.py
CHANGED
@@ -1,18 +1,16 @@
- (13 removed lines not shown in this view)
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Conversion between OpenAI APIs and native SRT APIs"""

 import asyncio
@@ -989,11 +987,15 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
             output_top_logprobs=ret_item["meta_info"]["output_top_logprobs"],
         )
         token_logprobs = []
-        for token, logprob in
+        for token_idx, (token, logprob) in enumerate(
+            zip(logprobs.tokens, logprobs.token_logprobs)
+        ):
             token_bytes = list(token.encode("utf-8"))
             top_logprobs = []
             if logprobs.top_logprobs:
-                for top_token, top_logprob in logprobs.top_logprobs[
+                for top_token, top_logprob in logprobs.top_logprobs[
+                    token_idx
+                ].items():
                     top_token_bytes = list(top_token.encode("utf-8"))
                     top_logprobs.append(
                         TopLogprob(
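The logprobs fix above keys each token's top-logprob dictionary by its position in the sequence, so the alternatives reported for token i actually belong to token i. A toy illustration of the new pairing, with invented sample data:

```python
# Invented sample data; mirrors the enumerate/zip pattern in the new adapter code.
tokens = ["Hello", ",", " world"]
token_logprobs = [-0.1, -0.5, -0.3]
top_logprobs = [
    {"Hello": -0.1, "Hi": -2.3},
    {",": -0.5, "!": -1.9},
    {" world": -0.3, " there": -2.7},
]

for token_idx, (token, logprob) in enumerate(zip(tokens, token_logprobs)):
    alternatives = top_logprobs[token_idx]  # aligned by position
    print(token, logprob, list(alternatives.items()))
```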
sglang/srt/openai_api/protocol.py
CHANGED
@@ -1,18 +1,16 @@
- (13 removed lines not shown in this view)
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Pydantic models for OpenAI API protocol"""

 import time
@@ -236,7 +234,7 @@ ChatCompletionMessageContentPart = Union[


 class ChatCompletionMessageGenericParam(BaseModel):
-    role: Literal["system", "assistant"]
+    role: Literal["system", "assistant", "tool"]
     content: Union[str, List[ChatCompletionMessageContentTextPart]]
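The `protocol.py` change widens the accepted message roles so that tool messages pass validation. A stand-in Pydantic model showing the effect (this is not the actual sglang class, which types `content` more strictly):

```python
from typing import List, Literal, Union

from pydantic import BaseModel


class GenericMessage(BaseModel):
    # "tool" is newly accepted alongside "system" and "assistant"
    role: Literal["system", "assistant", "tool"]
    content: Union[str, List[dict]]


GenericMessage(role="tool", content="search results: ...")  # valid after this change
# GenericMessage(role="user", content="hi")  # still rejected by this particular model
```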