sglang 0.4.2__py3-none-any.whl → 0.4.2.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. sglang/srt/constrained/outlines_backend.py +9 -1
  2. sglang/srt/custom_op.py +40 -0
  3. sglang/srt/entrypoints/engine.py +2 -2
  4. sglang/srt/layers/activation.py +10 -5
  5. sglang/srt/layers/attention/flashinfer_backend.py +284 -39
  6. sglang/srt/layers/attention/triton_backend.py +71 -7
  7. sglang/srt/layers/attention/triton_ops/decode_attention.py +53 -59
  8. sglang/srt/layers/attention/triton_ops/prefill_attention.py +6 -0
  9. sglang/srt/layers/attention/vision.py +243 -40
  10. sglang/srt/layers/layernorm.py +1 -5
  11. sglang/srt/layers/moe/ep_moe/layer.py +1 -3
  12. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  13. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  14. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +200 -0
  15. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +200 -0
  16. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +200 -0
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +178 -0
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +200 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +175 -0
  20. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +3 -11
  21. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -3
  22. sglang/srt/layers/moe/topk.py +4 -0
  23. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  24. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  25. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  26. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  27. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  30. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  31. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  32. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  33. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  34. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  35. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  36. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  37. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  39. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  40. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  41. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  42. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  44. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/fp8.py +7 -0
  46. sglang/srt/layers/quantization/fp8_kernel.py +140 -2
  47. sglang/srt/layers/rotary_embedding.py +29 -15
  48. sglang/srt/layers/sampler.py +9 -6
  49. sglang/srt/lora/backend/__init__.py +8 -0
  50. sglang/srt/lora/backend/base_backend.py +95 -0
  51. sglang/srt/lora/backend/flashinfer_backend.py +91 -0
  52. sglang/srt/lora/backend/triton_backend.py +61 -0
  53. sglang/srt/lora/lora.py +127 -112
  54. sglang/srt/lora/lora_manager.py +50 -18
  55. sglang/srt/lora/triton_ops/__init__.py +5 -0
  56. sglang/srt/lora/triton_ops/qkv_lora_b.py +182 -0
  57. sglang/srt/lora/triton_ops/sgemm_lora_a.py +143 -0
  58. sglang/srt/lora/triton_ops/sgemm_lora_b.py +159 -0
  59. sglang/srt/managers/image_processor.py +77 -38
  60. sglang/srt/managers/scheduler.py +17 -3
  61. sglang/srt/mem_cache/base_prefix_cache.py +4 -0
  62. sglang/srt/mem_cache/chunk_cache.py +3 -0
  63. sglang/srt/mem_cache/radix_cache.py +30 -1
  64. sglang/srt/model_executor/cuda_graph_runner.py +77 -80
  65. sglang/srt/model_executor/forward_batch_info.py +58 -59
  66. sglang/srt/model_executor/model_runner.py +2 -2
  67. sglang/srt/models/minicpmv.py +129 -76
  68. sglang/srt/models/mllama.py +16 -56
  69. sglang/srt/models/qwen2.py +4 -1
  70. sglang/srt/models/qwen2_vl.py +19 -9
  71. sglang/srt/server_args.py +19 -2
  72. sglang/srt/speculative/build_eagle_tree.py +4 -2
  73. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +213 -0
  74. sglang/srt/speculative/eagle_utils.py +361 -372
  75. sglang/srt/speculative/eagle_worker.py +177 -45
  76. sglang/srt/utils.py +7 -2
  77. sglang/test/runners.py +2 -0
  78. sglang/utils.py +42 -0
  79. sglang/version.py +1 -1
  80. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/METADATA +16 -7
  81. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/RECORD +84 -45
  82. sglang/srt/layers/custom_op_util.py +0 -25
  83. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/LICENSE +0 -0
  84. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/WHEEL +0 -0
  85. {sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/top_level.txt +0 -0
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -1,3 +1,5 @@
+ import logging
+ import time
  from typing import List, Optional, Union
 
  import torch
@@ -12,8 +14,18 @@ from sglang.srt.model_executor.forward_batch_info import (
  )
  from sglang.srt.model_executor.model_runner import ModelRunner
  from sglang.srt.server_args import ServerArgs
- from sglang.srt.speculative.eagle_utils import EAGLEDraftInput
- from sglang.srt.utils import rank0_print
+ from sglang.srt.speculative.eagle_draft_cuda_graph_runner import (
+     EAGLEDraftCudaGraphRunner,
+ )
+ from sglang.srt.speculative.eagle_utils import (
+     EagleDraftInput,
+     EagleVerifyInput,
+     assign_draft_cache_locs,
+     fast_topk,
+     select_top_k_tokens,
+ )
+
+ logger = logging.getLogger(__name__)
 
 
  class EAGLEWorker(TpModelWorker):
@@ -40,41 +52,47 @@ class EAGLEWorker(TpModelWorker):
              is_draft_worker=True,
          )
          self.target_worker = target_worker
-         self.server_args = server_args
          self.finish_extend_len = []
 
+         # Parse arguments
+         self.topk = server_args.speculative_eagle_topk
+         self.speculative_num_steps = server_args.speculative_num_steps
+         self.server_args = server_args
+
          # Share the embedding and lm_head
          embed, head = self.target_worker.model_runner.model.get_embed_and_head()
          self.model_runner.model.set_embed_and_head(embed, head)
          self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph
-         self.model_runner.init_cuda_graphs()
 
-     def forward_draft_decode(self, batch: ScheduleBatch):
-         batch.spec_info.prepare_for_decode(batch)
-         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
-         model_worker_batch = batch.get_model_worker_batch()
-         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
-         logits_output = self.model_runner.forward(forward_batch)
-         self.capture_for_decode(logits_output, forward_batch)
+         # Create multi-step attn backends and cuda graph runners
+         from sglang.srt.layers.attention.flashinfer_backend import (
+             FlashInferMultiStepDraftBackend,
+         )
 
-     def forward_draft_extend(self, batch: ScheduleBatch):
-         self._set_mem_pool(batch, self.model_runner)
-         batch.spec_info.prepare_for_extend(batch)
-         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
-         model_worker_batch = batch.get_model_worker_batch()
-         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
-         logits_output = self.model_runner.forward(forward_batch)
-         self.capture_for_decode(logits_output, forward_batch)
-         self._set_mem_pool(batch, self.target_worker.model_runner)
+         self.draft_attn_backend = FlashInferMultiStepDraftBackend(
+             self.model_runner,
+             self.topk,
+             self.speculative_num_steps,
+         )
+         self.model_runner.draft_attn_backend = self.draft_attn_backend
+         self.init_cuda_graphs()
+
+     def init_cuda_graphs(self):
+         """Capture cuda graphs."""
+         self.cuda_graph_runner = None
+
+         if self.server_args.disable_cuda_graph:
+             return
+
+         tic = time.time()
+         logger.info("Capture cuda graph begin. This can take up to several minutes.")
+         self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)
+         logger.info(f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f} s")
 
      def forward_batch_speculative_generation(self, batch: ScheduleBatch):
          if batch.forward_mode.is_decode():
              # Draft
-             self._set_mem_pool(batch, self.model_runner)
-             for i in range(self.server_args.speculative_num_steps):
-                 self.forward_draft_decode(batch)
-             batch.spec_info.clear_draft_cache(batch)
-             self._set_mem_pool(batch, self.target_worker.model_runner)
+             spec_info: EagleVerifyInput = self.draft(batch)
 
              # Verify
              (
@@ -84,8 +102,7 @@ class EAGLEWorker(TpModelWorker):
                  self.finish_extend_len,
                  accept_length_cpu,
                  model_worker_batch,
-             ) = self.verify(batch)
-             next_draft_input.load_server_args(self.server_args)
+             ) = self.verify(batch, spec_info)
              batch.spec_info = next_draft_input
              # if it is None, means all requsets are finished
             if batch.spec_info.verified_id is not None:
@@ -107,39 +124,156 @@ class EAGLEWorker(TpModelWorker):
              )
 
              # Forward with the draft model.
-             spec_info = EAGLEDraftInput()
-             spec_info.load_server_args(self.server_args)
-             spec_info.hidden_states = logits_output.hidden_states
-             spec_info.verified_id = next_token_ids
-             batch.spec_info = spec_info
+             batch.spec_info = EagleDraftInput(
+                 hidden_states=logits_output.hidden_states,
+                 verified_id=next_token_ids,
+             )
              self.forward_draft_extend(batch)
              return logits_output, next_token_ids, model_worker_batch, 0
 
-     def verify(self, batch: ScheduleBatch):
-         verify_input = batch.spec_info.prepare_for_verify(batch)
-         verify_input.prepare_for_verify(batch)
+     def draft(self, batch: ScheduleBatch):
+         self._set_mem_pool(batch, self.model_runner)
+
+         # Parse args
+         num_seqs = batch.batch_size()
+         spec_info = batch.spec_info
+
+         # Allocate cache locations
+         out_cache_loc = batch.alloc_token_slots(
+             num_seqs * self.topk * self.speculative_num_steps
+         )
+         assign_draft_cache_locs[(num_seqs,)](
+             batch.req_pool_indices,
+             batch.req_to_token_pool.req_to_token,
+             batch.seq_lens,
+             out_cache_loc,
+             batch.req_to_token_pool.req_to_token.shape[1],
+             self.topk,
+             self.speculative_num_steps,
+         )
+
+         batch.out_cache_loc = out_cache_loc
+         batch.seq_lens_sum = torch.sum(batch.seq_lens).item()
+         spec_info.positions = batch.seq_lens.repeat_interleave(self.topk, dim=0)
+
+         # Get forward batch
+         spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+         model_worker_batch = batch.get_model_worker_batch()
+         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+         can_cuda_graph = self.cuda_graph_runner and self.cuda_graph_runner.can_run(
+             forward_batch
+         )
+
+         if can_cuda_graph:
+             score_list, token_list, parents_list = self.cuda_graph_runner.replay(
+                 forward_batch
+             )
+         else:
+             # Initialize attention backend
+             self.draft_attn_backend.init_forward_metadata(forward_batch)
+
+             # Run forward steps
+             score_list, token_list, parents_list = self.draft_forward(forward_batch)
+
+         ret = EagleVerifyInput.create(
+             spec_info.verified_id,
+             score_list,
+             token_list,
+             parents_list,
+             batch.seq_lens,
+             batch.seq_lens_sum,
+             self.topk,
+             self.speculative_num_steps,
+             self.server_args.speculative_num_draft_tokens,
+         )
+
+         # Free cache locations
+         batch.token_to_kv_pool.free(out_cache_loc)
+         self._set_mem_pool(batch, self.target_worker.model_runner)
+         return ret
+
+     def draft_forward(self, forward_batch: ForwardBatch):
+         # Parse args
+         spec_info = forward_batch.spec_info
+         out_cache_loc = forward_batch.out_cache_loc
+         topk_p, topk_index, hidden_states = (
+             spec_info.topk_p,
+             spec_info.topk_index,
+             spec_info.hidden_states,
+         )
+
+         # Return values
+         score_list: List[torch.Tensor] = []
+         token_list: List[torch.Tensor] = []
+         parents_list: List[torch.Tensor] = []
+
+         # Forward multiple steps
+         scores = None
+         for i in range(self.speculative_num_steps):
+             input_ids, hidden_states, scores, tree_info = select_top_k_tokens(
+                 i, topk_p, topk_index, hidden_states, scores, self.topk
+             )
+             score_list.append(tree_info[0])
+             token_list.append(tree_info[1])
+             parents_list.append(tree_info[2])
+
+             # Set inputs
+             forward_batch.input_ids = input_ids
+             forward_batch.out_cache_loc = out_cache_loc[
+                 forward_batch.batch_size
+                 * self.topk
+                 * i : forward_batch.batch_size
+                 * self.topk
+                 * (i + 1)
+             ]
+             forward_batch.positions.add_(1)
+             forward_batch.attn_backend = self.draft_attn_backend.attn_backends[i]
+             spec_info.hidden_states = hidden_states
+
+             # Run forward
+             logits_output = self.model_runner.model.forward(
+                 forward_batch.input_ids, forward_batch.positions, forward_batch
+             )
+             probs = torch.softmax(logits_output.next_token_logits, dim=-1)
+             topk_p, topk_index = fast_topk(probs, self.topk, dim=-1)
+             hidden_states = logits_output.hidden_states
+
+         return score_list, token_list, parents_list
+
+     def verify(self, batch: ScheduleBatch, spec_info: EagleVerifyInput):
+         spec_info.prepare_for_verify(batch)
          batch.forward_mode = ForwardMode.TARGET_VERIFY
-         batch.spec_info = verify_input
-         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.FULL
+         batch.spec_info = spec_info
          model_worker_batch = batch.get_model_worker_batch()
          logits_output, _ = self.target_worker.forward_batch_generation(
              model_worker_batch, skip_sample=True
          )
-         verify_input.hidden_states = logits_output.hidden_states
-         res = verify_input.verify(batch, logits_output)
+         spec_info.hidden_states = logits_output.hidden_states
+         res = spec_info.verify(batch, logits_output)
          batch.forward_mode = ForwardMode.DECODE
          return res + (model_worker_batch,)
 
+     def forward_draft_extend(self, batch: ScheduleBatch):
+         self._set_mem_pool(batch, self.model_runner)
+         batch.spec_info.prepare_for_extend(batch)
+         batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+         model_worker_batch = batch.get_model_worker_batch()
+         forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+         logits_output = self.model_runner.forward(forward_batch)
+         self.capture_for_decode(logits_output, forward_batch)
+         self._set_mem_pool(batch, self.target_worker.model_runner)
+
      def _set_mem_pool(self, batch: ScheduleBatch, runner: ModelRunner):
          batch.token_to_kv_pool = runner.token_to_kv_pool
          batch.req_to_token_pool = runner.req_to_token_pool
 
      def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
          seq_lens_backup = batch.seq_lens
+         req_pool_indices_backup = batch.req_pool_indices
 
          self._set_mem_pool(batch, self.model_runner)
          batch.forward_mode = ForwardMode.DRAFT_EXTEND
-         batch.spec_info.prepare_extend_after_decode(batch)
+         batch.spec_info.prepare_extend_after_decode(batch, self.speculative_num_steps)
          batch.spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
          model_worker_batch = batch.get_model_worker_batch()
          forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
@@ -151,17 +285,15 @@ class EAGLEWorker(TpModelWorker):
          # This is because `seq_lens` can be modified in `prepare_extend_after_decode`
          batch.forward_mode = ForwardMode.DECODE
          batch.seq_lens = seq_lens_backup
+         batch.req_pool_indices = req_pool_indices_backup
 
      def capture_for_decode(
          self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
      ):
-         sample_output = torch.softmax(
-             logits_output.next_token_logits, dim=-1
-         )  # TODO(kavioyu): Support more sampling methods
+         probs = torch.softmax(logits_output.next_token_logits, dim=-1)
          spec_info = forward_batch.spec_info
-         spec_info.sample_output = sample_output
+         spec_info.topk_p, spec_info.topk_index = fast_topk(probs, self.topk, dim=-1)
          spec_info.hidden_states = logits_output.hidden_states
-         spec_info.prev_mode = forward_batch.forward_mode
 
      # Don't support prefix share now.
      def finish_request(self, reqs: Union[Req, List[Req]]):
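
Note: the reworked draft path above allocates every draft KV-cache slot for a decode round up front (`num_seqs * topk * speculative_num_steps` slots in `draft()`), and `draft_forward()` then consumes one contiguous `num_seqs * topk` slice per step. A small sketch of that slot arithmetic, with made-up sizes (the numbers are illustrative, not from this diff):

```python
# Illustrative only: hypothetical sizes for the slicing done in draft_forward().
num_seqs = 2   # decode batch size
topk = 4       # server_args.speculative_eagle_topk
num_steps = 3  # server_args.speculative_num_steps

total = num_seqs * topk * num_steps  # 24 slots allocated once per round
for i in range(num_steps):
    start = num_seqs * topk * i
    end = num_seqs * topk * (i + 1)
    print(f"step {i} writes KV cache into out_cache_loc[{start}:{end}]")
# step 0 writes KV cache into out_cache_loc[0:8]
# step 1 writes KV cache into out_cache_loc[8:16]
# step 2 writes KV cache into out_cache_loc[16:24]
```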
sglang/srt/utils.py CHANGED
@@ -444,8 +444,6 @@ def load_image(image_file: Union[str, bytes]):
      else:
          raise ValueError(f"Invalid image: {image}")
 
-     # if image_size is None:
-     #     image_size = image.size
      return image, image_size
 
 
@@ -1048,6 +1046,13 @@ def get_device_name(device_id: int = 0) -> str:
          return torch.hpu.get_device_name(device_id)
 
 
+ def get_device_core_count(device_id: int = 0) -> int:
+     if hasattr(torch, "cuda") and torch.cuda.is_available():
+         return torch.cuda.get_device_properties(device_id).multi_processor_count
+
+     return 0
+
+
  def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
      major, minor = None, None
      if hasattr(torch, "cuda") and torch.cuda.is_available():
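
Note: the new `get_device_core_count()` helper reports the CUDA multiprocessor (SM) count and falls back to 0 when no GPU is present, so it doubles as a cheap capability probe. A minimal usage sketch (the printed value depends on the GPU; an A100, for example, reports 108 SMs):

```python
from sglang.srt.utils import get_device_core_count

# Returns 0 on CPU-only hosts, so a truthiness check also works as a GPU probe.
sm_count = get_device_core_count(device_id=0)
if sm_count:
    print(f"device 0 has {sm_count} streaming multiprocessors")
else:
    print("no CUDA device available")
```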
sglang/test/runners.py CHANGED
@@ -272,6 +272,7 @@ class SRTRunner:
          port: int = DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
          lora_paths: List[str] = None,
          max_loras_per_batch: int = 4,
+         lora_backend: str = "triton",
          disable_cuda_graph: bool = False,
          disable_radix_cache: bool = False,
      ):
@@ -287,6 +288,7 @@ class SRTRunner:
              is_embedding=not self.is_generation,
              lora_paths=lora_paths,
              max_loras_per_batch=max_loras_per_batch,
+             lora_backend=lora_backend,
              disable_cuda_graph=disable_cuda_graph,
              disable_radix_cache=disable_radix_cache,
          )
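
Note: `SRTRunner` now threads a `lora_backend` argument through to the engine, matching the new `sglang/srt/lora/backend/` package (triton and flashinfer implementations) in the file list above. A hedged sketch of selecting it in a test; the model and adapter paths are placeholders, and constructor arguments other than those visible in the hunk are assumed from the existing `SRTRunner` signature:

```python
import torch
from sglang.test.runners import SRTRunner

# Placeholder paths; "triton" is the new default backend, "flashinfer"
# is the other one shipped in this release.
runner = SRTRunner(
    "meta-llama/Llama-2-7b-hf",       # model_path (placeholder)
    torch_dtype=torch.float16,
    model_type="generation",
    lora_paths=["/path/to/adapter"],  # placeholder adapter
    max_loras_per_batch=4,
    lora_backend="triton",
)
```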
sglang/utils.py CHANGED
@@ -373,3 +373,45 @@ class TypeBasedDispatcher:
          if isinstance(obj, ty):
              return fn(obj)
          raise ValueError(f"Invalid object: {obj}")
+
+
+ def trim_overlap(existing_text, new_chunk):
+     """
+     Finds the largest suffix of 'existing_text' that is a prefix of 'new_chunk'
+     and removes that overlap from the start of 'new_chunk'.
+     """
+     max_overlap = 0
+     max_possible = min(len(existing_text), len(new_chunk))
+     for i in range(max_possible, 0, -1):
+         if existing_text.endswith(new_chunk[:i]):
+             max_overlap = i
+             break
+     return new_chunk[max_overlap:]
+
+
+ def stream_and_merge(llm, prompt, sampling_params):
+     """
+     1) Streams the text,
+     2) Removes chunk overlaps,
+     3) Returns the merged text.
+     """
+     final_text = ""
+     for chunk in llm.generate(prompt, sampling_params, stream=True):
+         chunk_text = chunk["text"]
+         cleaned_chunk = trim_overlap(final_text, chunk_text)
+         final_text += cleaned_chunk
+     return final_text
+
+
+ async def async_stream_and_merge(llm, prompt, sampling_params):
+     """
+     Streams tokens asynchronously, removes chunk overlaps,
+     and yields the cleaned chunk in real time for printing.
+     """
+     final_text = ""
+     generator = await llm.async_generate(prompt, sampling_params, stream=True)
+     async for chunk in generator:
+         chunk_text = chunk["text"]
+         cleaned_chunk = trim_overlap(final_text, chunk_text)
+         final_text += cleaned_chunk
+         yield cleaned_chunk  # yield the non-overlapping portion
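
Note: these helpers guard against streamed chunks that re-send text already emitted: `trim_overlap("Hello, wor", "world!")` returns `"ld!"`, so the duplicated prefix is dropped before concatenation. A minimal offline-engine sketch (the model path is a placeholder):

```python
import sglang as sgl
from sglang.utils import stream_and_merge

llm = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model
sampling_params = {"temperature": 0.8, "max_new_tokens": 64}

# Streams chunks, trims overlap between consecutive chunks, and returns
# the merged completion as a single string.
print(stream_and_merge(llm, "The capital of France is", sampling_params))
llm.shutdown()
```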
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.4.2"
+ __version__ = "0.4.2.post2"
{sglang-0.4.2.dist-info → sglang-0.4.2.post2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: sglang
- Version: 0.4.2
+ Version: 0.4.2.post2
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
                          Version 2.0, January 2004
@@ -225,7 +225,6 @@ Requires-Dist: huggingface_hub; extra == "runtime-common"
  Requires-Dist: interegular; extra == "runtime-common"
  Requires-Dist: modelscope; extra == "runtime-common"
  Requires-Dist: orjson; extra == "runtime-common"
- Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
  Requires-Dist: packaging; extra == "runtime-common"
  Requires-Dist: pillow; extra == "runtime-common"
  Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
@@ -240,21 +239,27 @@ Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
  Requires-Dist: cuda-python; extra == "srt"
- Requires-Dist: sgl-kernel>=0.0.3; extra == "srt"
+ Requires-Dist: sgl-kernel>=0.0.3.post1; extra == "srt"
  Requires-Dist: torch; extra == "srt"
  Requires-Dist: vllm==0.6.4.post1; extra == "srt"
- Requires-Dist: flashinfer==0.1.6; extra == "srt"
+ Requires-Dist: flashinfer_python>=0.2.0.post2; extra == "srt"
+ Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt"
  Provides-Extra: srt-hip
  Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
  Requires-Dist: torch; extra == "srt-hip"
- Requires-Dist: vllm==0.6.3.post2.dev1; extra == "srt-hip"
+ Requires-Dist: vllm==0.6.7.dev2; extra == "srt-hip"
+ Requires-Dist: outlines==0.1.11; extra == "srt-hip"
+ Requires-Dist: sgl-kernel>=0.0.3.post1; extra == "srt-hip"
  Provides-Extra: srt-xpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
+ Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt-xpu"
  Provides-Extra: srt-hpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
+ Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt-hpu"
  Provides-Extra: srt-cpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-cpu"
  Requires-Dist: torch; extra == "srt-cpu"
+ Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "srt-cpu"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
@@ -333,7 +338,7 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
  | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
  ## News
- - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeekSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
+ - [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
  - [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
  - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
  - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
@@ -372,7 +377,11 @@ Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
  ## Adoption and Sponsorship
- The project is supported by (alphabetically): AMD, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+ The project is supported by (alphabetically): AMD, Atlas Cloud, Baseten, Cursor, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS CORP, Meituan, Nebius, Novita AI, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.
+
+ ## Contact Us
+
+ For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
 
  ## Acknowledgment and Citation
  We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.