sglang 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. sglang/__init__.py +33 -26
  2. sglang/api.py +9 -1
  3. sglang/bench_latency.py +2 -2
  4. sglang/bench_serving.py +10 -1
  5. sglang/check_env.py +1 -1
  6. sglang/lang/backend/litellm.py +1 -1
  7. sglang/lang/backend/openai.py +1 -1
  8. sglang/lang/backend/runtime_endpoint.py +4 -4
  9. sglang/lang/interpreter.py +24 -9
  10. sglang/lang/ir.py +1 -1
  11. sglang/srt/constrained/__init__.py +15 -0
  12. sglang/srt/constrained/base_cache.py +15 -0
  13. sglang/srt/constrained/fsm_cache.py +36 -1
  14. sglang/srt/constrained/jump_forward.py +15 -0
  15. sglang/srt/conversation.py +26 -0
  16. sglang/srt/hf_transformers_utils.py +18 -1
  17. sglang/srt/layers/context_flashattention_nopad.py +15 -0
  18. sglang/srt/layers/extend_attention.py +15 -0
  19. sglang/srt/layers/fused_moe.py +15 -0
  20. sglang/srt/layers/linear.py +15 -0
  21. sglang/srt/layers/logits_processor.py +109 -72
  22. sglang/srt/layers/quantization/__init__.py +15 -0
  23. sglang/srt/layers/quantization/fp8.py +15 -0
  24. sglang/srt/layers/radix_attention.py +21 -3
  25. sglang/srt/layers/token_attention.py +16 -1
  26. sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
  27. sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
  28. sglang/srt/managers/detokenizer_manager.py +16 -1
  29. sglang/srt/managers/io_struct.py +38 -5
  30. sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
  31. sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +85 -25
  32. sglang/srt/managers/tokenizer_manager.py +99 -57
  33. sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +177 -81
  34. sglang/srt/mem_cache/flush_cache.py +33 -0
  35. sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
  36. sglang/srt/{managers/controller → mem_cache}/radix_cache.py +15 -0
  37. sglang/srt/mm_utils.py +15 -0
  38. sglang/srt/model_config.py +20 -0
  39. sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +42 -18
  40. sglang/srt/{managers/controller → model_executor}/model_runner.py +51 -16
  41. sglang/srt/model_loader/model_loader.py +15 -0
  42. sglang/srt/model_loader/utils.py +16 -1
  43. sglang/srt/models/chatglm.py +16 -1
  44. sglang/srt/models/commandr.py +16 -1
  45. sglang/srt/models/dbrx.py +16 -1
  46. sglang/srt/models/deepseek.py +16 -1
  47. sglang/srt/models/deepseek_v2.py +532 -0
  48. sglang/srt/models/gemma.py +16 -1
  49. sglang/srt/models/gemma2.py +16 -1
  50. sglang/srt/models/gpt_bigcode.py +16 -1
  51. sglang/srt/models/grok.py +16 -1
  52. sglang/srt/models/internlm2.py +16 -1
  53. sglang/srt/models/llama2.py +16 -1
  54. sglang/srt/models/llama_classification.py +19 -4
  55. sglang/srt/models/llava.py +17 -2
  56. sglang/srt/models/llavavid.py +17 -2
  57. sglang/srt/models/minicpm.py +16 -1
  58. sglang/srt/models/mistral.py +15 -0
  59. sglang/srt/models/mixtral.py +16 -1
  60. sglang/srt/models/mixtral_quant.py +16 -1
  61. sglang/srt/models/qwen.py +16 -1
  62. sglang/srt/models/qwen2.py +16 -1
  63. sglang/srt/models/qwen2_moe.py +16 -1
  64. sglang/srt/models/stablelm.py +16 -1
  65. sglang/srt/models/yivl.py +15 -0
  66. sglang/srt/openai_api/adapter.py +545 -160
  67. sglang/srt/openai_api/protocol.py +65 -1
  68. sglang/srt/sampling_params.py +20 -4
  69. sglang/srt/server.py +90 -37
  70. sglang/srt/server_args.py +76 -17
  71. sglang/srt/utils.py +15 -0
  72. sglang/test/test_programs.py +5 -1
  73. sglang/utils.py +22 -0
  74. sglang/version.py +1 -1
  75. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/METADATA +40 -12
  76. sglang-0.2.7.dist-info/RECORD +93 -0
  77. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/WHEEL +1 -1
  78. sglang/srt/flush_cache.py +0 -18
  79. sglang-0.2.5.dist-info/RECORD +0 -92
  80. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/LICENSE +0 -0
  81. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/top_level.txt +0 -0
sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py RENAMED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Run the model with cuda graph."""
 
 import bisect
@@ -9,8 +24,12 @@ from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
 from vllm.distributed.parallel_state import graph_capture
 from vllm.model_executor.custom_op import CustomOp
 
-from sglang.srt.layers.logits_processor import LogitProcessorOutput
-from sglang.srt.managers.controller.infer_batch import (
+from sglang.srt.layers.logits_processor import (
+    LogitProcessorOutput,
+    LogitsMetadata,
+    LogitsProcessor,
+)
+from sglang.srt.managers.schedule_batch import (
     Batch,
     ForwardMode,
     InputMetadata,
@@ -185,7 +204,6 @@ class CudaGraphRunner:
 
     def replay(self, batch: Batch):
         assert batch.out_cache_loc is not None
-        assert not batch.return_logprob
         raw_bs = len(batch.reqs)
 
         # Pad
@@ -218,23 +236,29 @@
         output = self.output_buffers[bs]
 
         # Unpad
-        if bs == raw_bs:
-            return output
-        else:
+        if bs != raw_bs:
             output = LogitProcessorOutput(
                 next_token_logits=output.next_token_logits[:raw_bs],
-                next_token_logprobs=(
-                    output.next_token_logprobs[:raw_bs]
-                    if output.next_token_logprobs is not None
-                    else None
-                ),
+                next_token_logprobs=None,
                 normalized_prompt_logprobs=None,
-                prefill_token_logprobs=None,
-                prefill_top_logprobs=None,
-                decode_top_logprobs=(
-                    output.decode_top_logprobs[:raw_bs]
-                    if output.decode_top_logprobs is not None
-                    else None
-                ),
+                input_token_logprobs=None,
+                input_top_logprobs=None,
+                output_top_logprobs=None,
+            )
+
+        # Extract logprobs
+        if batch.return_logprob:
+            output.next_token_logprobs = torch.nn.functional.log_softmax(
+                output.next_token_logits, dim=-1
             )
+            return_top_logprob = any(x > 0 for x in batch.top_logprobs_nums)
+            if return_top_logprob:
+                logits_metadata = LogitsMetadata(
+                    forward_mode=ForwardMode.DECODE,
+                    top_logprobs_nums=batch.top_logprobs_nums,
+                )
+                output.output_top_logprobs = LogitsProcessor.get_top_logprobs(
+                    output.next_token_logprobs, logits_metadata
+                )[1]
+
         return output
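
Note: the replay path above now recomputes logprobs outside the captured graph, which is why the `assert not batch.return_logprob` guard could be dropped. A minimal standalone sketch of the same idea, using plain `torch.topk` in place of the `LogitsProcessor.get_top_logprobs` helper (the batch size, vocab size, and k below are illustrative, not values from the diff):

    import torch

    logits = torch.randn(4, 32000)  # stand-in for output.next_token_logits: [batch, vocab]
    logprobs = torch.nn.functional.log_softmax(logits, dim=-1)
    # Per-request top-k token logprobs, analogous to output_top_logprobs.
    top_values, top_indices = torch.topk(logprobs, k=5, dim=-1)
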
sglang/srt/{managers/controller → model_executor}/model_runner.py RENAMED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """ModelRunner runs the forward passes of the models."""
 
 import importlib
@@ -25,8 +40,13 @@ from vllm.distributed import (
 from vllm.model_executor.models import ModelRegistry
 
 from sglang.global_config import global_config
-from sglang.srt.managers.controller.infer_batch import Batch, ForwardMode, InputMetadata
-from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool
+from sglang.srt.managers.schedule_batch import (
+    Batch,
+    ForwardMode,
+    InputMetadata,
+    global_server_args_dict,
+)
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPool
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
     get_available_gpu_memory,
@@ -37,7 +57,7 @@ from sglang.srt.utils import (
     monkey_patch_vllm_qvk_linear_loader,
 )
 
-logger = logging.getLogger("srt.model_runner")
+logger = logging.getLogger(__name__)
 
 
 class ModelRunner:
@@ -60,11 +80,17 @@ class ModelRunner:
         self.nccl_port = nccl_port
         self.server_args = server_args
         self.is_multimodal_model = is_multimodal_model(self.model_config)
-        monkey_patch_vllm_dummy_weight_loader()
+        global_server_args_dict.update(
+            {
+                "disable_flashinfer": server_args.disable_flashinfer,
+                "disable_flashinfer_sampling": server_args.disable_flashinfer_sampling,
+                "attention_reduce_in_fp32": server_args.attention_reduce_in_fp32,
+            }
+        )
 
         # Init torch distributed
         torch.cuda.set_device(self.gpu_id)
-        logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
+        logger.info(f"[gpu={self.gpu_id}] Init nccl begin.")
 
         if not server_args.enable_p2p_check:
             monkey_patch_vllm_p2p_access_check(self.gpu_id)
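
The `global_server_args_dict.update(...)` call above publishes a few server flags through a module-level dict so that code with no handle on `ServerArgs` (e.g. attention layers) can read them. A hedged sketch of the consuming side; only the dict and its keys come from the diff, the function itself is hypothetical:

    from sglang.srt.managers.schedule_batch import global_server_args_dict

    def pick_attention_backend() -> str:
        # Hypothetical consumer: fall back to Triton kernels when flashinfer is disabled.
        if global_server_args_dict.get("disable_flashinfer", False):
            return "triton"
        return "flashinfer"
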
@@ -95,7 +121,7 @@
 
         # Load the model and create memory pool
         self.load_model()
-        self.init_memory_pool(total_gpu_memory)
+        self.init_memory_pool(total_gpu_memory, server_args.max_num_reqs)
         self.init_cublas()
         self.init_flash_infer()
 
@@ -104,10 +130,11 @@
 
     def load_model(self):
         logger.info(
-            f"[gpu_id={self.gpu_id}] Load weight begin. "
+            f"[gpu={self.gpu_id}] Load weight begin. "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )
 
+        monkey_patch_vllm_dummy_weight_loader()
         device_config = DeviceConfig()
         load_config = LoadConfig(load_format=self.server_args.load_format)
         vllm_model_config = VllmModelConfig(
@@ -151,7 +178,7 @@
             cache_config=None,
         )
         logger.info(
-            f"[gpu_id={self.gpu_id}] Load weight end. "
+            f"[gpu={self.gpu_id}] Load weight end. "
             f"type={type(self.model).__name__}, "
             f"dtype={self.dtype}, "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
@@ -176,7 +203,7 @@
         max_num_token = int(rest_memory * (1 << 30) // cell_size)
         return max_num_token
 
-    def init_memory_pool(self, total_gpu_memory):
+    def init_memory_pool(self, total_gpu_memory, max_num_reqs=None):
         self.max_total_num_tokens = self.profile_max_num_token(total_gpu_memory)
 
         if self.max_total_num_tokens <= 0:
@@ -184,11 +211,19 @@
                 "Not enough memory. Please try to increase --mem-fraction-static."
             )
 
+        if max_num_reqs is None:
+            max_num_reqs = min(
+                max(
+                    int(
+                        self.max_total_num_tokens / self.model_config.context_len * 512
+                    ),
+                    2048,
+                ),
+                5120,
+            )
+
         self.req_to_token_pool = ReqToTokenPool(
-            max(
-                int(self.max_total_num_tokens / self.model_config.context_len * 512),
-                2048,
-            ),
+            max_num_reqs,
             self.model_config.context_len + 8,
         )
         self.token_to_kv_pool = TokenToKVPool(
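
The default added above is a clamp: the request pool scales with how many full-context requests the token pool could hold (times 512), bounded to the range [2048, 5120]. A worked example with assumed numbers:

    # Assumed values, for illustration only.
    max_total_num_tokens, context_len = 500_000, 4096
    max_num_reqs = min(max(int(max_total_num_tokens / context_len * 512), 2048), 5120)
    # 500_000 / 4096 * 512 = 62_500, clamped down to the 5120 ceiling.
    print(max_num_reqs)  # 5120
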
@@ -199,7 +234,7 @@
             layer_num=self.model_config.num_hidden_layers,
         )
         logger.info(
-            f"[gpu_id={self.gpu_id}] Memory pool end. "
+            f"[gpu={self.gpu_id}] Memory pool end. "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )
 
@@ -243,14 +278,14 @@
         )
 
     def init_cuda_graphs(self):
-        from sglang.srt.managers.controller.cuda_graph_runner import CudaGraphRunner
+        from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
 
         if self.server_args.disable_cuda_graph or self.server_args.disable_flashinfer:
             self.cuda_graph_runner = None
             return
 
         logger.info(
-            f"[gpu_id={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
+            f"[gpu={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
         )
         batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 17)]
         self.cuda_graph_runner = CudaGraphRunner(
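
`batch_size_list` above fixes the set of captured graph sizes: 1, 2, 4, then multiples of 8 up to 128. At replay time a raw batch is padded up to the nearest captured size (the `# Pad` step in the cuda_graph_runner diff); a sketch under that assumption, using the module's `bisect` import, with an invented helper name:

    import bisect

    batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 17)]

    def padded_batch_size(raw_bs: int) -> int:
        # Round up to the nearest captured graph size.
        return batch_size_list[bisect.bisect_left(batch_size_list, raw_bs)]

    assert padded_batch_size(3) == 4
    assert padded_batch_size(9) == 16
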
sglang/srt/model_loader/model_loader.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # temporarily adapted from https://github.com/vllm-project/vllm/blob/10383887e03412196a2689b9398290719c4797bf/vllm/model_executor/model_loader/loader.py
 # FIXME: in progress of refactoring the model loader
 
sglang/srt/model_loader/utils.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # temporarily adapted from vLLM
 # FIXME: in progress of refactoring the model loader
 """Utilities for selecting and loading models."""
@@ -23,7 +38,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConf
 
 from sglang.srt.layers.quantization import get_quantization_config
 
-logger = logging.getLogger("srt.model_loader")
+logger = logging.getLogger(__name__)
 temp_dir = tempfile.gettempdir()
 
 
sglang/srt/models/chatglm.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # coding=utf-8
 # Adapted from
 # https://github.com/THUDM/ChatGLM2-6B
@@ -30,7 +45,7 @@ from vllm.transformers_utils.configs import ChatGLMConfig
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.managers.controller.model_runner import InputMetadata
+from sglang.srt.model_executor.model_runner import InputMetadata
 
 LoraConfig = None
 
sglang/srt/models/commandr.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # coding=utf-8
 # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved.
 #
@@ -49,7 +64,7 @@ from vllm.model_executor.utils import set_weight_attrs
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.managers.controller.model_runner import InputMetadata
+from sglang.srt.model_executor.model_runner import InputMetadata
 
 
 @torch.compile
sglang/srt/models/dbrx.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # Adapted from:
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/dbrx.py#L1
 # coding=utf-8
@@ -30,7 +45,7 @@ from vllm.transformers_utils.configs.dbrx import DbrxConfig
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.managers.controller.model_runner import InputMetadata
+from sglang.srt.model_executor.model_runner import InputMetadata
 
 
 class DbrxRouter(nn.Module):
sglang/srt/models/deepseek.py CHANGED
@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 # Adapted from:
 # https://github.com/vllm-project/vllm/blob/14f91fe67c2342f2fe859dc6a5c40810df0e1c61/vllm/model_executor/models/deepseek.py
 """Inference-only Deepseek model."""
@@ -31,7 +46,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.managers.controller.infer_batch import InputMetadata
+from sglang.srt.managers.schedule_batch import InputMetadata
 
 
 class DeepseekMLP(nn.Module):