sglang-0.3.2-py3-none-any.whl → sglang-0.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +23 -1
  3. sglang/bench_latency.py +46 -25
  4. sglang/bench_serving.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +14 -1
  6. sglang/lang/interpreter.py +16 -6
  7. sglang/lang/ir.py +20 -4
  8. sglang/srt/configs/model_config.py +11 -9
  9. sglang/srt/constrained/fsm_cache.py +9 -1
  10. sglang/srt/constrained/jump_forward.py +15 -2
  11. sglang/srt/layers/activation.py +4 -4
  12. sglang/srt/layers/attention/__init__.py +49 -0
  13. sglang/srt/layers/attention/flashinfer_backend.py +277 -0
  14. sglang/srt/layers/{flashinfer_utils.py → attention/flashinfer_utils.py} +82 -80
  15. sglang/srt/layers/attention/triton_backend.py +161 -0
  16. sglang/srt/layers/{triton_attention → attention/triton_ops}/extend_attention.py +3 -1
  17. sglang/srt/layers/layernorm.py +4 -4
  18. sglang/srt/layers/logits_processor.py +19 -15
  19. sglang/srt/layers/pooler.py +3 -3
  20. sglang/srt/layers/quantization/__init__.py +0 -2
  21. sglang/srt/layers/radix_attention.py +6 -4
  22. sglang/srt/layers/sampler.py +6 -4
  23. sglang/srt/layers/torchao_utils.py +18 -0
  24. sglang/srt/lora/lora.py +20 -21
  25. sglang/srt/lora/lora_manager.py +97 -25
  26. sglang/srt/managers/detokenizer_manager.py +31 -18
  27. sglang/srt/managers/image_processor.py +187 -0
  28. sglang/srt/managers/io_struct.py +99 -75
  29. sglang/srt/managers/schedule_batch.py +184 -63
  30. sglang/srt/managers/{policy_scheduler.py → schedule_policy.py} +31 -21
  31. sglang/srt/managers/scheduler.py +1021 -0
  32. sglang/srt/managers/tokenizer_manager.py +120 -248
  33. sglang/srt/managers/tp_worker.py +28 -925
  34. sglang/srt/mem_cache/memory_pool.py +34 -52
  35. sglang/srt/model_executor/cuda_graph_runner.py +15 -19
  36. sglang/srt/model_executor/forward_batch_info.py +94 -95
  37. sglang/srt/model_executor/model_runner.py +76 -75
  38. sglang/srt/models/baichuan.py +10 -10
  39. sglang/srt/models/chatglm.py +12 -12
  40. sglang/srt/models/commandr.py +10 -10
  41. sglang/srt/models/dbrx.py +12 -12
  42. sglang/srt/models/deepseek.py +10 -10
  43. sglang/srt/models/deepseek_v2.py +14 -15
  44. sglang/srt/models/exaone.py +10 -10
  45. sglang/srt/models/gemma.py +10 -10
  46. sglang/srt/models/gemma2.py +11 -11
  47. sglang/srt/models/gpt_bigcode.py +10 -10
  48. sglang/srt/models/grok.py +10 -10
  49. sglang/srt/models/internlm2.py +10 -10
  50. sglang/srt/models/llama.py +14 -10
  51. sglang/srt/models/llama_classification.py +5 -5
  52. sglang/srt/models/llama_embedding.py +4 -4
  53. sglang/srt/models/llama_reward.py +142 -0
  54. sglang/srt/models/llava.py +39 -33
  55. sglang/srt/models/llavavid.py +31 -28
  56. sglang/srt/models/minicpm.py +10 -10
  57. sglang/srt/models/minicpm3.py +14 -15
  58. sglang/srt/models/mixtral.py +10 -10
  59. sglang/srt/models/mixtral_quant.py +10 -10
  60. sglang/srt/models/olmoe.py +10 -10
  61. sglang/srt/models/qwen.py +10 -10
  62. sglang/srt/models/qwen2.py +11 -11
  63. sglang/srt/models/qwen2_moe.py +10 -10
  64. sglang/srt/models/stablelm.py +10 -10
  65. sglang/srt/models/torch_native_llama.py +506 -0
  66. sglang/srt/models/xverse.py +10 -10
  67. sglang/srt/models/xverse_moe.py +10 -10
  68. sglang/srt/sampling/sampling_batch_info.py +36 -27
  69. sglang/srt/sampling/sampling_params.py +3 -1
  70. sglang/srt/server.py +170 -119
  71. sglang/srt/server_args.py +54 -27
  72. sglang/srt/utils.py +101 -128
  73. sglang/test/runners.py +71 -26
  74. sglang/test/test_programs.py +38 -5
  75. sglang/test/test_utils.py +18 -9
  76. sglang/version.py +1 -1
  77. {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/METADATA +37 -19
  78. sglang-0.3.3.dist-info/RECORD +139 -0
  79. sglang/srt/layers/attention_backend.py +0 -474
  80. sglang/srt/managers/controller_multi.py +0 -207
  81. sglang/srt/managers/controller_single.py +0 -164
  82. sglang-0.3.2.dist-info/RECORD +0 -135
  83. /sglang/srt/layers/{triton_attention → attention/triton_ops}/decode_attention.py +0 -0
  84. /sglang/srt/layers/{triton_attention → attention/triton_ops}/prefill_attention.py +0 -0
  85. {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/LICENSE +0 -0
  86. {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/WHEEL +0 -0
  87. {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/top_level.txt +0 -0
sglang/srt/managers/{policy_scheduler.py → schedule_policy.py} +31 -21
@@ -13,12 +13,13 @@ See the License for the specific language governing permissions and
13
13
  limitations under the License.
14
14
  """
15
15
 
16
- """Request policy scheduler"""
16
+ """Request scheduler policy"""
17
17
 
18
18
  import os
19
19
  import random
20
20
  from collections import defaultdict
21
21
  from contextlib import contextmanager
22
+ from enum import Enum, auto
22
23
  from typing import Dict, List, Optional
23
24
 
24
25
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
@@ -32,7 +33,7 @@ from sglang.srt.mem_cache.radix_cache import TreeNode
32
33
  CLIP_MAX_NEW_TOKENS = int(os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS", "4096"))
33
34
 
34
35
 
35
- class PolicyScheduler:
36
+ class SchedulePolicy:
36
37
  def __init__(self, policy: str, tree_cache: BasePrefixCache):
37
38
  if tree_cache.disable and policy in ["lpm", "dfs-weight"]:
38
39
  # LPM and DFS-weight is meaningless when the tree cache is disabled.
@@ -104,6 +105,12 @@ class PolicyScheduler:
104
105
  q.extend(last_node_to_reqs[cur_node])
105
106
 
106
107
 
108
+ class AddReqResult(Enum):
109
+ CONTINUE = auto() # Continue to add requests
110
+ NO_TOKEN = auto() # No token left
111
+ OTHER = auto() # Other reasons to stop adding requests
112
+
113
+
107
114
  class PrefillAdder:
108
115
  def __init__(
109
116
  self,
@@ -145,17 +152,16 @@ class PrefillAdder:
145
152
  ]
146
153
  )
147
154
 
148
- def no_remaining_tokens(self):
149
- return (
150
- self.rem_total_tokens <= 0
151
- or self.rem_input_tokens <= 0
152
- or (
153
- self.rem_chunk_tokens <= 0
154
- if self.rem_chunk_tokens is not None
155
- else False
156
- )
157
- or self.cur_rem_tokens <= 0
158
- )
155
+ def budget_state(self):
156
+ if self.rem_total_tokens <= 0 or self.cur_rem_tokens <= 0:
157
+ return AddReqResult.NO_TOKEN
158
+
159
+ if self.rem_input_tokens <= 0 or (
160
+ self.rem_chunk_tokens is not None and self.rem_chunk_tokens <= 0
161
+ ):
162
+ return AddReqResult.OTHER
163
+
164
+ return AddReqResult.CONTINUE
159
165
 
160
166
  def _prefill_one_req(
161
167
  self, prefix_len: int, extend_input_len: int, max_new_tokens: int
@@ -212,6 +218,7 @@ class PrefillAdder:
212
218
  if not insert_sort:
213
219
  self.req_states.append((tokens_left, tokens_occupied))
214
220
  else:
221
+ i = 0
215
222
  for i in range(len(self.req_states)):
216
223
  if tokens_left <= self.req_states[i][0]:
217
224
  break
@@ -239,10 +246,13 @@ class PrefillAdder:
239
246
  )
240
247
  bs = len(self.req_states) - i
241
248
  if cur_rem_tokens + tokens_freed - decode_steps * bs <= 0:
242
- return False
249
+ return AddReqResult.NO_TOKEN
243
250
  tokens_freed += tokens_occupied
244
251
 
245
- if req.extend_input_len <= self.rem_chunk_tokens:
252
+ if (
253
+ self.rem_chunk_tokens is None
254
+ or req.extend_input_len <= self.rem_chunk_tokens
255
+ ):
246
256
  self.can_run_list.append(req)
247
257
  self._prefill_one_req(
248
258
  0,
@@ -258,7 +268,7 @@ class PrefillAdder:
258
268
  self.new_inflight_req = req
259
269
  self._prefill_one_req(0, trunc_len, 0)
260
270
 
261
- return True
271
+ return self.budget_state()
262
272
 
263
273
  def add_one_req(self, req: Req):
264
274
  if req.sampling_params.ignore_eos and self.tree_cache.disable:
@@ -271,14 +281,14 @@ class PrefillAdder:
271
281
  prefix_len = len(req.prefix_indices)
272
282
 
273
283
  if total_tokens >= self.rem_total_tokens:
274
- return False
284
+ return AddReqResult.NO_TOKEN
275
285
 
276
286
  if input_tokens > self.rem_input_tokens and len(self.can_run_list) != 0:
277
- return False
287
+ return AddReqResult.OTHER
278
288
 
279
289
  with self._lock_node(req.last_node):
280
290
  if total_tokens > self.rem_total_tokens:
281
- return False
291
+ return AddReqResult.NO_TOKEN
282
292
 
283
293
  if (
284
294
  self.rem_chunk_tokens is None
@@ -297,7 +307,7 @@ class PrefillAdder:
297
307
  # Chunked prefill
298
308
  trunc_len = self.rem_chunk_tokens
299
309
  if trunc_len == 0:
300
- return False
310
+ return AddReqResult.OTHER
301
311
 
302
312
  req.extend_input_len = trunc_len
303
313
  req.fill_ids = req.fill_ids[: len(req.prefix_indices) + trunc_len]
@@ -306,4 +316,4 @@ class PrefillAdder:
306
316
  self.tree_cache.inc_lock_ref(req.last_node)
307
317
  self._prefill_one_req(prefix_len, trunc_len, 0)
308
318
 
309
- return True and not self.no_remaining_tokens()
319
+ return self.budget_state()