sglang 0.3.5.post2__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. sglang/__init__.py +2 -2
  2. sglang/api.py +2 -2
  3. sglang/bench_latency.py +1 -553
  4. sglang/bench_offline_throughput.py +48 -20
  5. sglang/bench_one_batch.py +472 -0
  6. sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
  7. sglang/bench_serving.py +125 -6
  8. sglang/check_env.py +3 -6
  9. sglang/lang/backend/base_backend.py +1 -1
  10. sglang/lang/backend/runtime_endpoint.py +2 -2
  11. sglang/srt/configs/model_config.py +13 -14
  12. sglang/srt/constrained/__init__.py +13 -14
  13. sglang/srt/constrained/base_grammar_backend.py +13 -15
  14. sglang/srt/constrained/outlines_backend.py +28 -17
  15. sglang/srt/constrained/outlines_jump_forward.py +13 -15
  16. sglang/srt/constrained/xgrammar_backend.py +47 -58
  17. sglang/srt/conversation.py +13 -15
  18. sglang/srt/hf_transformers_utils.py +13 -15
  19. sglang/srt/layers/activation.py +16 -13
  20. sglang/srt/layers/attention/flashinfer_backend.py +106 -54
  21. sglang/srt/layers/attention/triton_backend.py +9 -7
  22. sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
  23. sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
  24. sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
  25. sglang/srt/layers/custom_op_util.py +25 -0
  26. sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
  27. sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +11 -4
  28. sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
  29. sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
  30. sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
  31. sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
  32. sglang/srt/layers/fused_moe_triton/layer.py +633 -0
  33. sglang/srt/layers/layernorm.py +17 -15
  34. sglang/srt/layers/logits_processor.py +23 -25
  35. sglang/srt/layers/quantization/__init__.py +77 -17
  36. sglang/srt/layers/radix_attention.py +13 -15
  37. sglang/srt/layers/rotary_embedding.py +13 -13
  38. sglang/srt/layers/sampler.py +4 -8
  39. sglang/srt/layers/torchao_utils.py +2 -0
  40. sglang/srt/lora/lora.py +13 -14
  41. sglang/srt/lora/lora_config.py +13 -14
  42. sglang/srt/lora/lora_manager.py +22 -24
  43. sglang/srt/managers/data_parallel_controller.py +98 -27
  44. sglang/srt/managers/detokenizer_manager.py +13 -15
  45. sglang/srt/managers/io_struct.py +63 -21
  46. sglang/srt/managers/schedule_batch.py +154 -59
  47. sglang/srt/managers/schedule_policy.py +18 -16
  48. sglang/srt/managers/scheduler.py +278 -109
  49. sglang/srt/managers/session_controller.py +61 -0
  50. sglang/srt/managers/tokenizer_manager.py +63 -18
  51. sglang/srt/managers/tp_worker.py +25 -16
  52. sglang/srt/managers/tp_worker_overlap_thread.py +62 -67
  53. sglang/srt/metrics/collector.py +13 -15
  54. sglang/srt/metrics/func_timer.py +13 -15
  55. sglang/srt/mm_utils.py +13 -14
  56. sglang/srt/model_executor/cuda_graph_runner.py +63 -25
  57. sglang/srt/model_executor/forward_batch_info.py +128 -32
  58. sglang/srt/model_executor/model_runner.py +132 -64
  59. sglang/srt/model_parallel.py +98 -0
  60. sglang/srt/models/chatglm.py +15 -16
  61. sglang/srt/models/commandr.py +15 -16
  62. sglang/srt/models/dbrx.py +15 -16
  63. sglang/srt/models/deepseek.py +15 -15
  64. sglang/srt/models/deepseek_v2.py +162 -59
  65. sglang/srt/models/exaone.py +14 -15
  66. sglang/srt/models/gemma.py +14 -14
  67. sglang/srt/models/gemma2.py +31 -25
  68. sglang/srt/models/gemma2_reward.py +13 -14
  69. sglang/srt/models/gpt_bigcode.py +14 -14
  70. sglang/srt/models/grok.py +15 -15
  71. sglang/srt/models/internlm2.py +13 -15
  72. sglang/srt/models/internlm2_reward.py +13 -14
  73. sglang/srt/models/llama.py +21 -21
  74. sglang/srt/models/llama_classification.py +13 -14
  75. sglang/srt/models/llama_reward.py +13 -14
  76. sglang/srt/models/llava.py +14 -16
  77. sglang/srt/models/llavavid.py +14 -16
  78. sglang/srt/models/minicpm.py +13 -15
  79. sglang/srt/models/minicpm3.py +13 -15
  80. sglang/srt/models/mistral.py +13 -15
  81. sglang/srt/models/mixtral.py +15 -15
  82. sglang/srt/models/mixtral_quant.py +14 -14
  83. sglang/srt/models/olmo.py +22 -20
  84. sglang/srt/models/olmoe.py +23 -20
  85. sglang/srt/models/phi3_small.py +447 -0
  86. sglang/srt/models/qwen.py +14 -14
  87. sglang/srt/models/qwen2.py +22 -19
  88. sglang/srt/models/qwen2_moe.py +17 -18
  89. sglang/srt/models/qwen2_vl.py +13 -6
  90. sglang/srt/models/stablelm.py +18 -16
  91. sglang/srt/models/torch_native_llama.py +107 -93
  92. sglang/srt/models/xverse.py +13 -14
  93. sglang/srt/models/xverse_moe.py +15 -16
  94. sglang/srt/models/yivl.py +13 -15
  95. sglang/srt/openai_api/adapter.py +19 -17
  96. sglang/srt/openai_api/protocol.py +14 -16
  97. sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
  98. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
  99. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
  100. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
  101. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
  102. sglang/srt/sampling/sampling_batch_info.py +61 -57
  103. sglang/srt/sampling/sampling_params.py +14 -16
  104. sglang/srt/server.py +86 -35
  105. sglang/srt/server_args.py +96 -80
  106. sglang/srt/utils.py +266 -68
  107. sglang/test/few_shot_gsm8k.py +8 -4
  108. sglang/test/runners.py +38 -20
  109. sglang/test/srt/sampling/penaltylib/utils.py +23 -21
  110. sglang/test/test_utils.py +31 -20
  111. sglang/version.py +1 -1
  112. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
  113. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +66 -57
  114. sglang-0.3.6.post1.dist-info/RECORD +164 -0
  115. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +1 -1
  116. sglang/srt/layers/fused_moe/__init__.py +0 -1
  117. sglang-0.3.5.post2.dist-info/RECORD +0 -156
  118. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/stablelm.py CHANGED
@@ -1,22 +1,24 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 # Adapted from:
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/stablelm.py#L1
-"""Inference-only StableLM-2 (https://huggingface.co/stabilityai/stablelm-2-1_6b)
-model compatible with HuggingFace weights."""
+"""
+Inference-only StableLM-2 (https://huggingface.co/stabilityai/stablelm-2-1_6b)
+model compatible with HuggingFace weights.
+"""
+
 from typing import Iterable, Optional, Tuple
 
 import torch
sglang/srt/models/torch_native_llama.py CHANGED
@@ -1,21 +1,44 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
-"""Inference-only LLaMA model compatible with HuggingFace weights."""
+"""
+Inference-only LLaMA model compatible with HuggingFace weights.
+
+This model supports tensor parallelism (TP) using the PyTorch tensor parallel package.
+Reference: https://pytorch.org/docs/stable/distributed.tensor.parallel.html
+
+Here is a quick example to enable TP:
+```python
+from sglang.srt.model_parallel import tensor_parallel
+
+device_mesh = torch.distributed.init_device_mesh("cuda", (tp_size,))
+tensor_parallel(model, device_mesh)
+```
+
+An end-to-end example can be found in `python/sglang/bench_one_batch.py`.
+You can run it with the following command:
+```bash
+$ python3 -m sglang.bench_one_batch --correct \
+  --model meta-llama/Meta-Llama-3-8B \
+  --json-model-override-args '{"architectures": ["TorchNativeLlamaForCausalLM"]}' \
+  --tensor-parallel-size 2 \
+  --disable-cuda-graph
+```
+We will enable CUDA Graph support soon.
+"""
 
 import types
 from typing import Any, Dict, Iterable, Optional, Tuple
@@ -24,7 +47,10 @@ import torch
 from torch import nn
 from torch.nn.parameter import Parameter
 from transformers import LlamaConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
@@ -41,35 +67,45 @@ from sglang.srt.layers.vocab_parallel_embedding import (
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
+tp_size = get_tensor_model_parallel_world_size()
+tp_rank = get_tensor_model_parallel_rank()
+
 
 def gate_up_proj_weight_loader(
     self,
     param: Parameter,
     loaded_weight: torch.Tensor,
-    loaded_shard_id: Optional[int] = None,
+    loaded_shard_id: int,
 ):
-    if loaded_shard_id is None:
-        shard_offsets: List[Tuple[int, int, int]] = []
-        for i, output_size in enumerate(self.output_sizes):
-            shard_offsets.append((i, current_shard_offset, output_size))
-            current_shard_offset += output_size
-        for shard_id, shard_offset, shard_size in shard_offsets:
-            loaded_weight_shard = loaded_weight.narrow(
-                output_dim, shard_offset, shard_size
-            )
-            self.weight_loader(param, loaded_weight_shard, shard_id)
-    else:
-        assert loaded_shard_id < len(self.output_sizes)
-        param_data = param.data
-        shard_size = loaded_weight.shape[0]
-        shard_offset = loaded_shard_id * shard_size
-        param_data = param_data.narrow(0, shard_offset, shard_size)
-        assert param_data.shape == loaded_weight.shape
-        param_data.copy_(loaded_weight)
-    return
+    # shard_id: (shard_offset, shard_size)
+    gate_up_offsets = {}
+    current_shard_offset = 0
+    for i, output_size in enumerate(self.output_sizes):
+        # Everything shrinks by tp_size if TP enabled
+        output_size = output_size // tp_size
+        gate_up_offsets[i] = (current_shard_offset, output_size)
+        current_shard_offset += output_size
+    # Re-size the param to the size after TP
+    if current_shard_offset != param.shape[0]:
+        # The clone will free the original, full tensor
+        param.data = param.data.narrow(0, 0, current_shard_offset).clone()
+
+    # Now load gate or up
+    assert loaded_shard_id < len(self.output_sizes)
+    param_data = param.data
+    shard_offset, shard_size = gate_up_offsets[loaded_shard_id]
+    param_data = param_data.narrow(0, shard_offset, shard_size)
+    loaded_weight = loaded_weight.narrow(0, tp_rank * shard_size, shard_size)
+    assert param_data.shape == loaded_weight.shape
+    param_data.copy_(loaded_weight)
 
 
 class LlamaMLP(nn.Module):
+    _tp_plan = {
+        "gate_up_proj": "Colwise_Sharded",
+        "down_proj": "Rowwise",
+    }
+
     def __init__(
         self,
         hidden_size: int,
@@ -104,62 +140,44 @@ class LlamaMLP(nn.Module):
         return x
 
 
-def _get_shard_offset_mapping(self, loaded_shard_id: str):
-    shard_offset_mapping = {
-        "q": 0,
-        "k": self.num_heads * self.head_size,
-        "v": (self.num_heads + self.num_kv_heads) * self.head_size,
-        "total": (self.num_heads + 2 * self.num_kv_heads) * self.head_size,
-    }
-    return shard_offset_mapping.get(loaded_shard_id)
-
-
-def _get_shard_size_mapping(self, loaded_shard_id: str):
-    shard_size_mapping = {
-        "q": self.num_heads * self.head_size,
-        "k": self.num_kv_heads * self.head_size,
-        "v": self.num_kv_heads * self.head_size,
-    }
-    return shard_size_mapping.get(loaded_shard_id)
-
-
 def qkv_proj_weight_loader(
     self,
     param: Parameter,
     loaded_weight: torch.Tensor,
-    loaded_shard_id: Optional[str] = None,
+    loaded_shard_id: str,
 ):
-    if loaded_shard_id is None:
-        shard_offsets = [
-            # (shard_id, shard_offset, shard_size)
-            ("q", 0, self.total_num_heads * self.head_size),
-            (
-                "k",
-                self.total_num_heads * self.head_size,
-                self.total_num_kv_heads * self.head_size,
-            ),
-            (
-                "v",
-                (self.total_num_heads + self.total_num_kv_heads) * self.head_size,
-                self.total_num_kv_heads * self.head_size,
-            ),
-        ]
-        for shard_id, shard_offset, shard_size in shard_offsets:
-            loaded_weight_shard = loaded_weight.narrow(
-                param.output_dim, shard_offset, shard_size
-            )
-            self.weight_loader(param, loaded_weight_shard, shard_id)
-    else:
-        shard_offset = self._get_shard_offset_mapping(loaded_shard_id)
-        shard_size = self._get_shard_size_mapping(loaded_shard_id)
-        param_data = param.data
-        param_data = param_data.narrow(0, shard_offset, shard_size)
-        assert param_data.shape == loaded_weight.shape
-        param_data.copy_(loaded_weight)
-    return
+    num_heads = self.num_heads // tp_size
+    num_kv_heads = self.num_kv_heads // tp_size
+    # shard_id: (shard_offset, shard_size)
+    qkv_offsets = {
+        "q": (0, num_heads * self.head_size),
+        "k": (num_heads * self.head_size, num_kv_heads * self.head_size),
+        "v": (
+            (num_heads + num_kv_heads) * self.head_size,
+            num_kv_heads * self.head_size,
+        ),
+    }
+    total_size = qkv_offsets["v"][0] + qkv_offsets["v"][1]
+    # Re-size the param to the size after TP
+    if total_size != param.shape[0]:
+        # The clone will free the original, full tensor
+        param.data = param.data.narrow(0, 0, total_size).clone()
+
+    # Now load q, k or v
+    shard_offset, shard_size = qkv_offsets[loaded_shard_id]
+    param_data = param.data
+    param_data = param_data.narrow(0, shard_offset, shard_size)
+    loaded_weight = loaded_weight.narrow(0, tp_rank * shard_size, shard_size)
+    assert param_data.shape == loaded_weight.shape
+    param_data.copy_(loaded_weight)
 
 
 class LlamaAttention(nn.Module):
+    _tp_plan = {
+        "qkv_proj": "Colwise_Sharded",
+        "o_proj": "Rowwise",
+    }
+
     def __init__(
         self,
         config: LlamaConfig,
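The rewritten loaders shard the fused weights per rank before copying: shard offsets come from the per-rank head counts, and `loaded_weight` is narrowed at `tp_rank * shard_size` so each rank copies only its slice. A standalone sketch of that offset arithmetic, under assumed Llama-3-8B-like dimensions (32 query heads, 8 KV heads, head size 128, tp_size 2; the numbers are illustrative, not taken from the diff):

```python
# Hypothetical dimensions for illustration.
head_size = 128
num_heads = 32 // 2      # query heads per rank at tp_size=2
num_kv_heads = 8 // 2    # KV heads per rank at tp_size=2

# Same shape as qkv_offsets in qkv_proj_weight_loader above:
# shard_id -> (shard_offset, shard_size), in rows of the fused qkv weight.
qkv_offsets = {
    "q": (0, num_heads * head_size),
    "k": (num_heads * head_size, num_kv_heads * head_size),
    "v": (
        (num_heads + num_kv_heads) * head_size,
        num_kv_heads * head_size,
    ),
}
print(qkv_offsets)  # {'q': (0, 2048), 'k': (2048, 512), 'v': (2560, 512)}
# Per-rank rows: 2048 + 512 + 512 = 3072, i.e. the full (32 + 2*8) * 128 = 6144
# rows halved across the two ranks.
```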
@@ -176,7 +194,6 @@ class LlamaAttention(nn.Module):
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
-        tp_size = get_tensor_model_parallel_world_size()
         self.total_num_heads = num_heads
         assert self.total_num_heads % tp_size == 0
         self.num_heads = self.total_num_heads // tp_size
@@ -205,20 +222,12 @@
             (self.total_num_heads + 2 * self.total_num_kv_heads) * self.head_dim,
             bias=False,
         )
-        self.qkv_proj.total_num_heads = self.total_num_heads
         self.qkv_proj.head_size = self.head_dim
-        self.qkv_proj.total_num_kv_heads = self.total_num_kv_heads
         self.qkv_proj.num_heads = self.total_num_heads
         self.qkv_proj.num_kv_heads = self.total_num_kv_heads
         self.qkv_proj.weight_loader = types.MethodType(
             qkv_proj_weight_loader, self.qkv_proj
         )
-        self.qkv_proj._get_shard_offset_mapping = types.MethodType(
-            _get_shard_offset_mapping, self.qkv_proj
-        )
-        self.qkv_proj._get_shard_size_mapping = types.MethodType(
-            _get_shard_size_mapping, self.qkv_proj
-        )
         self.qkv_proj.weight.weight_loader = self.qkv_proj.weight_loader
         self.qkv_proj.weight.output_dim = 0
         self.o_proj = torch.nn.Linear(
@@ -385,10 +394,15 @@ class TorchNativeLlamaForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.torchao_config = global_server_args_dict["torchao_config"]
+        self.supports_torch_tp = True
         self.model = LlamaModel(config, quant_config=quant_config)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
 
+        # turning off autotune for fp8dq since it doesn't give speedup and
+        # increases compile time significantly
+        torch._inductor.config.max_autotune_gemm_backends = "ATEN"
+
     @torch.no_grad()
     def forward(
         self,
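The `_tp_plan` attributes added above name a sharding style ("Colwise_Sharded" or "Rowwise") for each projection. Below is a minimal sketch of how such a plan could be mapped onto PyTorch's public tensor-parallel API; the `apply_tp_plan` helper and the style-to-class table are illustrative assumptions, not sglang's actual implementation (which lives in the new `sglang/srt/model_parallel.py`):

```python
# Sketch only: assumes an initialized process group and a module whose class
# defines a _tp_plan dict such as {"qkv_proj": "Colwise_Sharded", "o_proj": "Rowwise"}.
import torch.nn as nn
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)

# Assumed mapping from plan strings to PyTorch ParallelStyle classes.
_STYLES = {
    "Colwise_Sharded": ColwiseParallel,
    "Rowwise": RowwiseParallel,
}


def apply_tp_plan(module: nn.Module, mesh: DeviceMesh) -> nn.Module:
    # Translate the string plan into ParallelStyle instances and hand the
    # whole thing to parallelize_module, which shards the named submodules.
    plan = {name: _STYLES[style]() for name, style in module._tp_plan.items()}
    return parallelize_module(module, mesh, plan)
```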
sglang/srt/models/xverse.py CHANGED
@@ -1,17 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/xverse.py#L1
sglang/srt/models/xverse_moe.py CHANGED
@@ -1,19 +1,18 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Inference-only XVERSE MoE model."""
+
 from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
@@ -25,7 +24,6 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -36,6 +34,7 @@ from vllm.model_executor.layers.linear import (
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from sglang.srt.layers.fused_moe_triton import fused_moe
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
sglang/srt/models/yivl.py CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Inference-only Yi-VL model."""
 
 from typing import Iterable, Optional, Tuple
sglang/srt/openai_api/adapter.py CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Conversion between OpenAI APIs and native SRT APIs"""
 
 import asyncio
@@ -989,11 +987,15 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
             output_top_logprobs=ret_item["meta_info"]["output_top_logprobs"],
         )
         token_logprobs = []
-        for token, logprob in zip(logprobs.tokens, logprobs.token_logprobs):
+        for token_idx, (token, logprob) in enumerate(
+            zip(logprobs.tokens, logprobs.token_logprobs)
+        ):
             token_bytes = list(token.encode("utf-8"))
             top_logprobs = []
             if logprobs.top_logprobs:
-                for top_token, top_logprob in logprobs.top_logprobs[0].items():
+                for top_token, top_logprob in logprobs.top_logprobs[
+                    token_idx
+                ].items():
                     top_token_bytes = list(top_token.encode("utf-8"))
                     top_logprobs.append(
                         TopLogprob(
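The adapter hunk above fixes top-logprob alignment: the old loop always read `logprobs.top_logprobs[0]`, attaching the first position's candidates to every output token, while indexing by `token_idx` keeps candidates paired with their own token. A toy illustration with made-up values:

```python
# Hypothetical logprob data for three generated tokens.
tokens = ["Hello", ",", " world"]
top_logprobs = [
    {"Hello": -0.1, "Hi": -2.3},
    {",": -0.2, "!": -1.9},
    {" world": -0.3, " there": -2.1},
]

for token_idx, token in enumerate(tokens):
    candidates = top_logprobs[token_idx]  # was top_logprobs[0] before the fix
    # Holds at every position only when indexing by token_idx; with [0] it
    # fails as soon as token_idx > 0.
    assert token in candidates
```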
sglang/srt/openai_api/protocol.py CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Pydantic models for OpenAI API protocol"""
 
 import time
@@ -236,7 +234,7 @@ ChatCompletionMessageContentPart = Union[
 
 
 class ChatCompletionMessageGenericParam(BaseModel):
-    role: Literal["system", "assistant"]
+    role: Literal["system", "assistant", "tool"]
     content: Union[str, List[ChatCompletionMessageContentTextPart]]
 
 
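The protocol hunk widens the accepted `role` values so that "tool" messages validate instead of being rejected. A minimal self-contained sketch (the two field definitions are copied from the diff; the text-part model here is a simplified stand-in for the one defined elsewhere in the file):

```python
from typing import List, Literal, Union

from pydantic import BaseModel


class ChatCompletionMessageContentTextPart(BaseModel):
    type: Literal["text"]
    text: str


class ChatCompletionMessageGenericParam(BaseModel):
    role: Literal["system", "assistant", "tool"]
    content: Union[str, List[ChatCompletionMessageContentTextPart]]


# Accepted as of 0.3.6.post1; raised a ValidationError in 0.3.5.post2.
msg = ChatCompletionMessageGenericParam(role="tool", content="tool output")
```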