sglang-0.3.2-py3-none-any.whl → sglang-0.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +23 -1
- sglang/bench_latency.py +46 -25
- sglang/bench_serving.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +14 -1
- sglang/lang/interpreter.py +16 -6
- sglang/lang/ir.py +20 -4
- sglang/srt/configs/model_config.py +11 -9
- sglang/srt/constrained/fsm_cache.py +9 -1
- sglang/srt/constrained/jump_forward.py +15 -2
- sglang/srt/layers/activation.py +4 -4
- sglang/srt/layers/attention/__init__.py +49 -0
- sglang/srt/layers/attention/flashinfer_backend.py +277 -0
- sglang/srt/layers/{flashinfer_utils.py → attention/flashinfer_utils.py} +82 -80
- sglang/srt/layers/attention/triton_backend.py +161 -0
- sglang/srt/layers/{triton_attention → attention/triton_ops}/extend_attention.py +3 -1
- sglang/srt/layers/layernorm.py +4 -4
- sglang/srt/layers/logits_processor.py +19 -15
- sglang/srt/layers/pooler.py +3 -3
- sglang/srt/layers/quantization/__init__.py +0 -2
- sglang/srt/layers/radix_attention.py +6 -4
- sglang/srt/layers/sampler.py +6 -4
- sglang/srt/layers/torchao_utils.py +18 -0
- sglang/srt/lora/lora.py +20 -21
- sglang/srt/lora/lora_manager.py +97 -25
- sglang/srt/managers/detokenizer_manager.py +31 -18
- sglang/srt/managers/image_processor.py +187 -0
- sglang/srt/managers/io_struct.py +99 -75
- sglang/srt/managers/schedule_batch.py +184 -63
- sglang/srt/managers/{policy_scheduler.py → schedule_policy.py} +31 -21
- sglang/srt/managers/scheduler.py +1021 -0
- sglang/srt/managers/tokenizer_manager.py +120 -248
- sglang/srt/managers/tp_worker.py +28 -925
- sglang/srt/mem_cache/memory_pool.py +34 -52
- sglang/srt/model_executor/cuda_graph_runner.py +15 -19
- sglang/srt/model_executor/forward_batch_info.py +94 -95
- sglang/srt/model_executor/model_runner.py +76 -75
- sglang/srt/models/baichuan.py +10 -10
- sglang/srt/models/chatglm.py +12 -12
- sglang/srt/models/commandr.py +10 -10
- sglang/srt/models/dbrx.py +12 -12
- sglang/srt/models/deepseek.py +10 -10
- sglang/srt/models/deepseek_v2.py +14 -15
- sglang/srt/models/exaone.py +10 -10
- sglang/srt/models/gemma.py +10 -10
- sglang/srt/models/gemma2.py +11 -11
- sglang/srt/models/gpt_bigcode.py +10 -10
- sglang/srt/models/grok.py +10 -10
- sglang/srt/models/internlm2.py +10 -10
- sglang/srt/models/llama.py +14 -10
- sglang/srt/models/llama_classification.py +5 -5
- sglang/srt/models/llama_embedding.py +4 -4
- sglang/srt/models/llama_reward.py +142 -0
- sglang/srt/models/llava.py +39 -33
- sglang/srt/models/llavavid.py +31 -28
- sglang/srt/models/minicpm.py +10 -10
- sglang/srt/models/minicpm3.py +14 -15
- sglang/srt/models/mixtral.py +10 -10
- sglang/srt/models/mixtral_quant.py +10 -10
- sglang/srt/models/olmoe.py +10 -10
- sglang/srt/models/qwen.py +10 -10
- sglang/srt/models/qwen2.py +11 -11
- sglang/srt/models/qwen2_moe.py +10 -10
- sglang/srt/models/stablelm.py +10 -10
- sglang/srt/models/torch_native_llama.py +506 -0
- sglang/srt/models/xverse.py +10 -10
- sglang/srt/models/xverse_moe.py +10 -10
- sglang/srt/sampling/sampling_batch_info.py +36 -27
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +170 -119
- sglang/srt/server_args.py +54 -27
- sglang/srt/utils.py +101 -128
- sglang/test/runners.py +71 -26
- sglang/test/test_programs.py +38 -5
- sglang/test/test_utils.py +18 -9
- sglang/version.py +1 -1
- {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/METADATA +37 -19
- sglang-0.3.3.dist-info/RECORD +139 -0
- sglang/srt/layers/attention_backend.py +0 -474
- sglang/srt/managers/controller_multi.py +0 -207
- sglang/srt/managers/controller_single.py +0 -164
- sglang-0.3.2.dist-info/RECORD +0 -135
- sglang/srt/layers/{triton_attention → attention/triton_ops}/decode_attention.py +0 -0
- sglang/srt/layers/{triton_attention → attention/triton_ops}/prefill_attention.py +0 -0
- {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/LICENSE +0 -0
- {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/WHEEL +0 -0
- {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/top_level.txt +0 -0
@@ -13,12 +13,13 @@ See the License for the specific language governing permissions and
|
|
13
13
|
limitations under the License.
|
14
14
|
"""
|
15
15
|
|
16
|
-
"""Request policy
|
16
|
+
"""Request scheduler policy"""
|
17
17
|
|
18
18
|
import os
|
19
19
|
import random
|
20
20
|
from collections import defaultdict
|
21
21
|
from contextlib import contextmanager
|
22
|
+
from enum import Enum, auto
|
22
23
|
from typing import Dict, List, Optional
|
23
24
|
|
24
25
|
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
|
@@ -32,7 +33,7 @@ from sglang.srt.mem_cache.radix_cache import TreeNode
|
|
32
33
|
CLIP_MAX_NEW_TOKENS = int(os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS", "4096"))
|
33
34
|
|
34
35
|
|
35
|
-
class PolicyScheduler:
|
36
|
+
class SchedulePolicy:
|
36
37
|
def __init__(self, policy: str, tree_cache: BasePrefixCache):
|
37
38
|
if tree_cache.disable and policy in ["lpm", "dfs-weight"]:
|
38
39
|
# LPM and DFS-weight is meaningless when the tree cache is disabled.
|
@@ -104,6 +105,12 @@ class PolicyScheduler:
|
|
104
105
|
q.extend(last_node_to_reqs[cur_node])
|
105
106
|
|
106
107
|
|
108
|
+
class AddReqResult(Enum):
|
109
|
+
CONTINUE = auto() # Continue to add requests
|
110
|
+
NO_TOKEN = auto() # No token left
|
111
|
+
OTHER = auto() # Other reasons to stop adding requests
|
112
|
+
|
113
|
+
|
107
114
|
class PrefillAdder:
|
108
115
|
def __init__(
|
109
116
|
self,
|
@@ -145,17 +152,16 @@ class PrefillAdder:
|
|
145
152
|
]
|
146
153
|
)
|
147
154
|
|
148
|
-
def
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
)
|
155
|
+
def budget_state(self):
|
156
|
+
if self.rem_total_tokens <= 0 or self.cur_rem_tokens <= 0:
|
157
|
+
return AddReqResult.NO_TOKEN
|
158
|
+
|
159
|
+
if self.rem_input_tokens <= 0 or (
|
160
|
+
self.rem_chunk_tokens is not None and self.rem_chunk_tokens <= 0
|
161
|
+
):
|
162
|
+
return AddReqResult.OTHER
|
163
|
+
|
164
|
+
return AddReqResult.CONTINUE
|
159
165
|
|
160
166
|
def _prefill_one_req(
|
161
167
|
self, prefix_len: int, extend_input_len: int, max_new_tokens: int
|
@@ -212,6 +218,7 @@ class PrefillAdder:
|
|
212
218
|
if not insert_sort:
|
213
219
|
self.req_states.append((tokens_left, tokens_occupied))
|
214
220
|
else:
|
221
|
+
i = 0
|
215
222
|
for i in range(len(self.req_states)):
|
216
223
|
if tokens_left <= self.req_states[i][0]:
|
217
224
|
break
|
@@ -239,10 +246,13 @@ class PrefillAdder:
|
|
239
246
|
)
|
240
247
|
bs = len(self.req_states) - i
|
241
248
|
if cur_rem_tokens + tokens_freed - decode_steps * bs <= 0:
|
242
|
-
return
|
249
|
+
return AddReqResult.NO_TOKEN
|
243
250
|
tokens_freed += tokens_occupied
|
244
251
|
|
245
|
-
if
|
252
|
+
if (
|
253
|
+
self.rem_chunk_tokens is None
|
254
|
+
or req.extend_input_len <= self.rem_chunk_tokens
|
255
|
+
):
|
246
256
|
self.can_run_list.append(req)
|
247
257
|
self._prefill_one_req(
|
248
258
|
0,
|
@@ -258,7 +268,7 @@ class PrefillAdder:
|
|
258
268
|
self.new_inflight_req = req
|
259
269
|
self._prefill_one_req(0, trunc_len, 0)
|
260
270
|
|
261
|
-
return
|
271
|
+
return self.budget_state()
|
262
272
|
|
263
273
|
def add_one_req(self, req: Req):
|
264
274
|
if req.sampling_params.ignore_eos and self.tree_cache.disable:
|
@@ -271,14 +281,14 @@ class PrefillAdder:
|
|
271
281
|
prefix_len = len(req.prefix_indices)
|
272
282
|
|
273
283
|
if total_tokens >= self.rem_total_tokens:
|
274
|
-
return
|
284
|
+
return AddReqResult.NO_TOKEN
|
275
285
|
|
276
286
|
if input_tokens > self.rem_input_tokens and len(self.can_run_list) != 0:
|
277
|
-
return
|
287
|
+
return AddReqResult.OTHER
|
278
288
|
|
279
289
|
with self._lock_node(req.last_node):
|
280
290
|
if total_tokens > self.rem_total_tokens:
|
281
|
-
return
|
291
|
+
return AddReqResult.NO_TOKEN
|
282
292
|
|
283
293
|
if (
|
284
294
|
self.rem_chunk_tokens is None
|
@@ -297,7 +307,7 @@ class PrefillAdder:
|
|
297
307
|
# Chunked prefill
|
298
308
|
trunc_len = self.rem_chunk_tokens
|
299
309
|
if trunc_len == 0:
|
300
|
-
return
|
310
|
+
return AddReqResult.OTHER
|
301
311
|
|
302
312
|
req.extend_input_len = trunc_len
|
303
313
|
req.fill_ids = req.fill_ids[: len(req.prefix_indices) + trunc_len]
|
@@ -306,4 +316,4 @@ class PrefillAdder:
|
|
306
316
|
self.tree_cache.inc_lock_ref(req.last_node)
|
307
317
|
self._prefill_one_req(prefix_len, trunc_len, 0)
|
308
318
|
|
309
|
-
return
|
319
|
+
return self.budget_state()
|