sglang 0.1.14__py3-none-any.whl → 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +59 -2
- sglang/api.py +40 -11
- sglang/backend/anthropic.py +17 -3
- sglang/backend/litellm.py +90 -0
- sglang/backend/openai.py +160 -12
- sglang/backend/runtime_endpoint.py +62 -27
- sglang/backend/vertexai.py +1 -0
- sglang/bench_latency.py +320 -0
- sglang/global_config.py +24 -3
- sglang/lang/chat_template.py +122 -6
- sglang/lang/compiler.py +2 -2
- sglang/lang/interpreter.py +206 -98
- sglang/lang/ir.py +98 -34
- sglang/lang/tracer.py +6 -4
- sglang/launch_server.py +4 -1
- sglang/launch_server_llavavid.py +32 -0
- sglang/srt/constrained/__init__.py +14 -6
- sglang/srt/constrained/fsm_cache.py +9 -2
- sglang/srt/constrained/jump_forward.py +113 -24
- sglang/srt/conversation.py +4 -2
- sglang/srt/flush_cache.py +18 -0
- sglang/srt/hf_transformers_utils.py +144 -3
- sglang/srt/layers/context_flashattention_nopad.py +1 -0
- sglang/srt/layers/extend_attention.py +20 -1
- sglang/srt/layers/fused_moe.py +596 -0
- sglang/srt/layers/logits_processor.py +190 -61
- sglang/srt/layers/radix_attention.py +62 -53
- sglang/srt/layers/token_attention.py +21 -9
- sglang/srt/managers/controller/cuda_graph_runner.py +196 -0
- sglang/srt/managers/controller/dp_worker.py +113 -0
- sglang/srt/managers/controller/infer_batch.py +908 -0
- sglang/srt/managers/controller/manager_multi.py +195 -0
- sglang/srt/managers/controller/manager_single.py +177 -0
- sglang/srt/managers/controller/model_runner.py +359 -0
- sglang/srt/managers/{router → controller}/radix_cache.py +102 -53
- sglang/srt/managers/controller/schedule_heuristic.py +65 -0
- sglang/srt/managers/controller/tp_worker.py +813 -0
- sglang/srt/managers/detokenizer_manager.py +42 -40
- sglang/srt/managers/io_struct.py +44 -10
- sglang/srt/managers/tokenizer_manager.py +224 -82
- sglang/srt/memory_pool.py +52 -59
- sglang/srt/model_config.py +97 -2
- sglang/srt/models/chatglm.py +399 -0
- sglang/srt/models/commandr.py +369 -0
- sglang/srt/models/dbrx.py +406 -0
- sglang/srt/models/gemma.py +34 -38
- sglang/srt/models/gemma2.py +436 -0
- sglang/srt/models/grok.py +738 -0
- sglang/srt/models/llama2.py +47 -37
- sglang/srt/models/llama_classification.py +107 -0
- sglang/srt/models/llava.py +92 -27
- sglang/srt/models/llavavid.py +298 -0
- sglang/srt/models/minicpm.py +366 -0
- sglang/srt/models/mixtral.py +302 -127
- sglang/srt/models/mixtral_quant.py +372 -0
- sglang/srt/models/qwen.py +40 -35
- sglang/srt/models/qwen2.py +33 -36
- sglang/srt/models/qwen2_moe.py +473 -0
- sglang/srt/models/stablelm.py +33 -39
- sglang/srt/models/yivl.py +19 -26
- sglang/srt/openai_api_adapter.py +411 -0
- sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +44 -19
- sglang/srt/sampling_params.py +2 -0
- sglang/srt/server.py +197 -481
- sglang/srt/server_args.py +190 -74
- sglang/srt/utils.py +460 -95
- sglang/test/test_programs.py +73 -10
- sglang/test/test_utils.py +226 -7
- sglang/utils.py +97 -27
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/METADATA +74 -45
- sglang-0.1.21.dist-info/RECORD +82 -0
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/WHEEL +1 -1
- sglang/srt/backend_config.py +0 -13
- sglang/srt/managers/router/infer_batch.py +0 -503
- sglang/srt/managers/router/manager.py +0 -79
- sglang/srt/managers/router/model_rpc.py +0 -686
- sglang/srt/managers/router/model_runner.py +0 -514
- sglang/srt/managers/router/scheduler.py +0 -70
- sglang-0.1.14.dist-info/RECORD +0 -64
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/LICENSE +0 -0
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,65 @@
|
|
1
|
+
"""Request scheduler heuristic."""
|
2
|
+
|
3
|
+
import random
|
4
|
+
from collections import defaultdict
|
5
|
+
|
6
|
+
|
7
|
+
class ScheduleHeuristic:
    """Reorder the queue of pending requests according to a named heuristic.

    Supported heuristic names (see ``get_priority_queue``):

    * ``"lpm"``        -- longest prefix match first.
    * ``"random"``     -- random shuffle.
    * ``"fcfs"``       -- first come, first served (queue order unchanged).
    * ``"dfs-weight"`` -- depth-first walk of the tree cache, visiting the
      subtrees that hold the most pending requests first.
    """

    def __init__(
        self,
        schedule_heuristic,
        max_running_seqs,
        max_prefill_num_tokens,
        max_total_num_tokens,
        tree_cache,
    ):
        """Store the scheduling configuration.

        Args:
            schedule_heuristic: Name of the heuristic to apply.
            max_running_seqs: Stored on the instance; not used by this class.
            max_prefill_num_tokens: Stored on the instance; not used by this
                class.
            max_total_num_tokens: Stored on the instance; not used by this
                class.
            tree_cache: Object exposing ``disable`` and ``root_node``; tree
                nodes expose a ``children`` mapping.
        """
        if tree_cache.disable and schedule_heuristic == "lpm":
            # LPM (longest prefix match) is meaningless when the tree cache
            # is disabled, so fall back to first-come-first-served.
            schedule_heuristic = "fcfs"

        self.schedule_heuristic = schedule_heuristic
        self.max_running_seqs = max_running_seqs
        self.max_prefill_num_tokens = max_prefill_num_tokens
        self.max_total_num_tokens = max_total_num_tokens
        self.tree_cache = tree_cache

    def get_priority_queue(self, forward_queue):
        """Return the requests of ``forward_queue`` in scheduling order.

        For ``"lpm"`` and ``"random"`` the input list is reordered in place
        and returned; for ``"fcfs"`` it is returned untouched; for
        ``"dfs-weight"`` a new list is returned.

        Args:
            forward_queue: List of pending requests. ``"lpm"`` reads
                ``req.prefix_indices``; ``"dfs-weight"`` reads
                ``req.last_node``.

        Raises:
            ValueError: If the configured heuristic name is unknown.
        """
        if self.schedule_heuristic == "lpm":
            # Longest prefix match: the sort is stable, so requests with
            # equally long prefixes keep their arrival order.
            forward_queue.sort(key=lambda x: -len(x.prefix_indices))
            return forward_queue
        elif self.schedule_heuristic == "random":
            random.shuffle(forward_queue)
            return forward_queue
        elif self.schedule_heuristic == "fcfs":
            return forward_queue
        elif self.schedule_heuristic == "dfs-weight":
            # Group the requests by the tree node they are attached to.
            last_node_to_reqs = defaultdict(list)
            for req in forward_queue:
                last_node_to_reqs[req.last_node].append(req)

            # Seed every node with its own request count, then propagate the
            # counts upwards so each node carries the weight of its subtree.
            node_to_weight = defaultdict(int)
            for node, reqs in last_node_to_reqs.items():
                node_to_weight[node] = len(reqs)
            self.calc_weight(self.tree_cache.root_node, node_to_weight)

            q = []
            self.get_dfs_priority(
                self.tree_cache.root_node, node_to_weight, last_node_to_reqs, q
            )
            # Every request must have been reached from the root node.
            assert len(q) == len(forward_queue)
            return q
        else:
            raise ValueError(f"Unknown schedule_heuristic: {self.schedule_heuristic}")

    def calc_weight(self, cur_node, node_to_weight):
        """Accumulate subtree weights bottom-up into ``node_to_weight``.

        After the call, the weight of every node in the subtree rooted at
        ``cur_node`` is its initial weight plus the final weights of all of
        its children. The traversal is iterative so that very deep trees do
        not hit the interpreter recursion limit.

        Args:
            cur_node: Root of the subtree to process.
            node_to_weight: Mapping from node to weight, updated in place.
        """
        # Pre-order listing of (node, parent) pairs; a node always appears
        # after its parent.
        visit_order = []
        pending = [(cur_node, None)]
        while pending:
            node, parent = pending.pop()
            visit_order.append((node, parent))
            for child in node.children.values():
                pending.append((child, node))

        # Walking the listing backwards handles all descendants of a node
        # before the node itself, so its weight is final when it is added
        # to its parent.
        for node, parent in reversed(visit_order):
            if parent is not None:
                node_to_weight[parent] += node_to_weight[node]

    def get_dfs_priority(self, cur_node, node_to_priority, last_node_to_reqs, q):
        """Append requests to ``q`` in depth-first, heaviest-subtree-first order.

        Children of a node are visited in descending priority (ties keep the
        order of ``children.values()``); the requests attached to a node are
        appended after all of its descendants. The traversal is iterative so
        that very deep trees do not hit the interpreter recursion limit.

        Args:
            cur_node: Root of the subtree to walk.
            node_to_priority: Mapping from node to its priority (weight).
            last_node_to_reqs: Mapping from node to the requests attached to it.
            q: Output list, extended in place.
        """
        # Each stack entry is (node, children_already_scheduled).
        pending = [(cur_node, False)]
        while pending:
            node, expanded = pending.pop()
            if expanded:
                # All descendants are done; emit this node's own requests.
                q.extend(last_node_to_reqs[node])
                continue

            pending.append((node, True))
            children = sorted(
                node.children.values(), key=lambda x: -node_to_priority[x]
            )
            # Push in reverse so the highest-priority child is popped first.
            pending.extend((child, False) for child in reversed(children))
|