sglang 0.4.1.post3__py3-none-any.whl → 0.4.1.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -0
- sglang/bench_serving.py +18 -1
- sglang/lang/interpreter.py +71 -1
- sglang/lang/ir.py +2 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/chatglm.py +78 -0
- sglang/srt/configs/dbrx.py +279 -0
- sglang/srt/configs/model_config.py +1 -1
- sglang/srt/hf_transformers_utils.py +9 -14
- sglang/srt/layers/attention/__init__.py +22 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +0 -52
- sglang/srt/layers/attention/flashinfer_backend.py +215 -83
- sglang/srt/layers/attention/torch_native_backend.py +1 -38
- sglang/srt/layers/attention/triton_backend.py +20 -11
- sglang/srt/layers/attention/triton_ops/decode_attention.py +4 -0
- sglang/srt/layers/linear.py +159 -55
- sglang/srt/layers/logits_processor.py +170 -215
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +198 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -7
- sglang/srt/layers/parameter.py +431 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/fp8.py +3 -3
- sglang/srt/layers/quantization/modelopt_quant.py +174 -0
- sglang/srt/layers/sampler.py +57 -21
- sglang/srt/layers/torchao_utils.py +17 -3
- sglang/srt/layers/vocab_parallel_embedding.py +1 -1
- sglang/srt/managers/cache_controller.py +307 -0
- sglang/srt/managers/data_parallel_controller.py +2 -0
- sglang/srt/managers/io_struct.py +1 -2
- sglang/srt/managers/schedule_batch.py +33 -3
- sglang/srt/managers/schedule_policy.py +159 -90
- sglang/srt/managers/scheduler.py +68 -28
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +27 -21
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/memory_pool.py +206 -1
- sglang/srt/metrics/collector.py +22 -30
- sglang/srt/model_executor/cuda_graph_runner.py +129 -77
- sglang/srt/model_executor/forward_batch_info.py +51 -21
- sglang/srt/model_executor/model_runner.py +72 -64
- sglang/srt/models/chatglm.py +1 -1
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/deepseek_v2.py +34 -7
- sglang/srt/models/grok.py +109 -29
- sglang/srt/models/llama.py +9 -2
- sglang/srt/openai_api/adapter.py +0 -17
- sglang/srt/openai_api/protocol.py +3 -3
- sglang/srt/sampling/sampling_batch_info.py +22 -0
- sglang/srt/sampling/sampling_params.py +9 -1
- sglang/srt/server.py +20 -13
- sglang/srt/server_args.py +120 -58
- sglang/srt/speculative/build_eagle_tree.py +347 -0
- sglang/srt/speculative/eagle_utils.py +626 -0
- sglang/srt/speculative/eagle_worker.py +184 -0
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/utils.py +47 -7
- sglang/test/test_programs.py +23 -1
- sglang/test/test_utils.py +36 -7
- sglang/version.py +1 -1
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/METADATA +12 -12
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/RECORD +86 -57
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/WHEEL +1 -1
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/top_level.txt +0 -0
sglang/srt/speculative/eagle_worker.py
ADDED
@@ -0,0 +1,184 @@
+from typing import List, Optional, Union
+
+import torch
+
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.managers.tp_worker import TpModelWorker
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+)
+from sglang.srt.model_executor.model_runner import ModelRunner
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.speculative.eagle_utils import EAGLEDraftInput
+
+
+class EAGLEWorker(TpModelWorker):
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        gpu_id: int,
+        tp_rank: int,
+        dp_rank: Optional[int],
+        nccl_port: int,
+        target_worker: TpModelWorker,
+    ):
+        # Do not capture cuda graph in `super().__init__()`
+        # We will capture it later
+        backup_disable_cuda_graph = server_args.disable_cuda_graph
+        server_args.disable_cuda_graph = True
+        super().__init__(
+            gpu_id=gpu_id,
+            tp_rank=tp_rank,
+            server_args=server_args,
+            nccl_port=nccl_port,
+            dp_rank=dp_rank,
+            is_draft_worker=True,
+        )
+        self.target_worker = target_worker
+        self.server_args = server_args
+
+        # Share the embedding and lm_head
+        embed, head = self.target_worker.model_runner.model.get_embed_and_head()
+        self.model_runner.model.set_embed_and_head(embed, head)
+        self.model_runner.server_args.disable_cuda_graph = backup_disable_cuda_graph
+        self.model_runner.init_cuda_graphs()
+
+    def forward_draft_decode(self, batch: ScheduleBatch):
+        batch.spec_info.prepare_for_decode(batch)
+        model_worker_batch = batch.get_model_worker_batch()
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        forward_batch.capture_hidden_mode = CaptureHiddenMode.LAST
+        logits_output = self.model_runner.forward(forward_batch)
+        self.capture_for_decode(logits_output, forward_batch)
+
+    def forward_draft_extend(self, batch: ScheduleBatch):
+        self._set_mem_pool(batch, self.model_runner)
+        batch.spec_info.prepare_for_extend(batch)
+        model_worker_batch = batch.get_model_worker_batch()
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        forward_batch.capture_hidden_mode = CaptureHiddenMode.LAST
+        logits_output = self.model_runner.forward(forward_batch)
+        self.capture_for_decode(logits_output, forward_batch)
+        self._set_mem_pool(batch, self.target_worker.model_runner)
+
+    def forward_batch_speculative_generation(self, batch: ScheduleBatch):
+        if batch.forward_mode.is_decode():
+            # Draft
+            self._set_mem_pool(batch, self.model_runner)
+            for i in range(self.server_args.speculative_num_steps):
+                self.forward_draft_decode(batch)
+            batch.spec_info.clear_draft_cache(batch)
+            self._set_mem_pool(batch, self.target_worker.model_runner)
+
+            # Verify
+            (
+                next_draft_input,
+                logits_output,
+                verified_id,
+                self.finish_extend_len,
+                accept_length_cpu,
+                model_worker_batch,
+            ) = self.verify(batch)
+            next_draft_input.load_server_args(self.server_args)
+            batch.spec_info = next_draft_input
+            # if it is None, it means all requests are finished
+            if batch.spec_info.verified_id is not None:
+                self.forward_draft_extend_after_decode(batch)
+            return (
+                logits_output,
+                verified_id,
+                model_worker_batch,
+                sum(accept_length_cpu),
+            )
+
+        else:
+            # Forward with the target model and get hidden states.
+            # We need the full hidden states to prefill the KV cache of the draft model.
+            model_worker_batch = batch.get_model_worker_batch()
+            model_worker_batch.capture_hidden_mode = CaptureHiddenMode.FULL
+            logits_output, next_token_ids = self.target_worker.forward_batch_generation(
+                model_worker_batch
+            )
+
+            # Forward with the draft model.
+            spec_info = EAGLEDraftInput()
+            spec_info.load_server_args(self.server_args)
+            spec_info.hidden_states = logits_output.hidden_states
+            spec_info.verified_id = next_token_ids
+            batch.spec_info = spec_info
+            self.forward_draft_extend(batch)
+            return logits_output, next_token_ids, model_worker_batch, 0
+
+    def verify(self, batch: ScheduleBatch):
+        verify_input = batch.spec_info.prepare_for_verify(batch)
+        verify_input.prepare_for_verify(batch)
+        batch.forward_mode = ForwardMode.TARGET_VERIFY
+        batch.spec_info = verify_input
+        batch.spec_info.capture_hidden_mode = CaptureHiddenMode.FULL
+        model_worker_batch = batch.get_model_worker_batch()
+        logits_output, _ = self.target_worker.forward_batch_generation(
+            model_worker_batch, skip_sample=True
+        )
+        verify_input.hidden_states = logits_output.hidden_states
+        res = verify_input.verify(batch, logits_output)
+        batch.forward_mode = ForwardMode.DECODE
+        return res + (model_worker_batch,)
+
+    def _set_mem_pool(self, batch: ScheduleBatch, runner: ModelRunner):
+        batch.token_to_kv_pool = runner.token_to_kv_pool
+        batch.req_to_token_pool = runner.req_to_token_pool
+
+    def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
+        self._set_mem_pool(batch, self.model_runner)
+        batch.forward_mode = ForwardMode.DRAFT_EXTEND
+        if batch.spec_info.has_finished:
+            index = batch.spec_info.unfinished_index
+            seq_lens = batch.seq_lens
+            batch.seq_lens = batch.seq_lens[index]
+
+        batch.spec_info.prepare_extend_after_decode(batch)
+        model_worker_batch = batch.get_model_worker_batch()
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        forward_batch.capture_hidden_mode = CaptureHiddenMode.LAST
+        logits_output = self.model_runner.forward(forward_batch)
+
+        batch.spec_info.hidden_states = logits_output.hidden_states
+        self.capture_for_decode(logits_output, forward_batch)
+        batch.forward_mode = ForwardMode.DECODE
+        if batch.spec_info.has_finished:
+            batch.seq_lens = seq_lens
+        self._set_mem_pool(batch, self.target_worker.model_runner)
+
+    def capture_for_decode(
+        self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
+    ):
+        sample_output = torch.softmax(
+            logits_output.next_token_logits, dim=-1
+        )  # TODO(kavioyu): Support more sampling methods
+        spec_info = forward_batch.spec_info
+        spec_info.sample_output = sample_output
+        spec_info.hidden_states = logits_output.hidden_states
+        spec_info.prev_mode = forward_batch.forward_mode
+
+    # Don't support prefix share now.
+    def finish_request(self, reqs: Union[Req, List[Req]]):
+        if not isinstance(reqs, List):
+            reqs = [reqs]
+        for req in reqs:
+            if req.rid not in self.finish_extend_len:
+                continue
+            req_len = (
+                len(req.origin_input_ids)
+                + len(req.output_ids)
+                - self.finish_extend_len[req.rid]
+                - 1
+            )
+            kv_indices = self.model_runner.req_to_token_pool.req_to_token[
+                req.req_pool_idx
+            ][:req_len]
+            self.model_runner.token_to_kv_pool.free(kv_indices)
+            self.model_runner.req_to_token_pool.free(req.req_pool_idx)
sglang/srt/speculative/spec_info.py
CHANGED
@@ -2,8 +2,12 @@ from enum import IntEnum, auto


 class SpeculativeAlgorithm(IntEnum):
+    NONE = auto()
     EAGLE = auto()

+    def is_none(self):
+        return self == SpeculativeAlgorithm.NONE
+
     def is_eagle(self):
         return self == SpeculativeAlgorithm.EAGLE

@@ -11,6 +15,7 @@ class SpeculativeAlgorithm(IntEnum):
     def from_string(name: str):
         name_map = {
             "EAGLE": SpeculativeAlgorithm.EAGLE,
+            None: SpeculativeAlgorithm.NONE,
         }
         return name_map[name]

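A quick illustration (an assumption, not from the diff) of what the new NONE member enables: server code can pass a missing speculative-algorithm setting straight to from_string() and branch on is_none().

from sglang.srt.speculative.spec_info import SpeculativeAlgorithm

algo = SpeculativeAlgorithm.from_string(None)  # None now maps to NONE
assert algo.is_none() and not algo.is_eagle()

algo = SpeculativeAlgorithm.from_string("EAGLE")
assert algo.is_eagle()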
sglang/srt/utils.py
CHANGED
@@ -15,6 +15,7 @@

 import base64
 import dataclasses
+import io
 import ipaddress
 import itertools
 import json
@@ -34,6 +35,7 @@ import warnings
 from functools import lru_cache
 from importlib.metadata import PackageNotFoundError, version
 from io import BytesIO
+from multiprocessing.reduction import ForkingPickler
 from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union

 import numpy as np
@@ -60,7 +62,6 @@ from triton.runtime.cache import (

 logger = logging.getLogger(__name__)

-
 show_time_cost = False
 time_infos = {}

@@ -334,6 +335,8 @@ def is_port_available(port):
         return True
     except socket.error:
         return False
+    except OverflowError:
+        return False


 def decode_video_base64(video_base64):
@@ -708,13 +711,14 @@ def broadcast_pyobj(
     data: List[Any],
     rank: int,
     dist_group: Optional[torch.distributed.ProcessGroup] = None,
+    src: int = 0,
 ):
     """Broadcast inputs from rank=0 to all other ranks with torch.dist backend."""

     if rank == 0:
         if len(data) == 0:
             tensor_size = torch.tensor([0], dtype=torch.long)
-            dist.broadcast(tensor_size, src=
+            dist.broadcast(tensor_size, src=src, group=dist_group)
         else:
             serialized_data = pickle.dumps(data)
             size = len(serialized_data)
@@ -723,19 +727,19 @@ def broadcast_pyobj(
             )
             tensor_size = torch.tensor([size], dtype=torch.long)

-            dist.broadcast(tensor_size, src=
-            dist.broadcast(tensor_data, src=
+            dist.broadcast(tensor_size, src=src, group=dist_group)
+            dist.broadcast(tensor_data, src=src, group=dist_group)
         return data
     else:
         tensor_size = torch.tensor([0], dtype=torch.long)
-        dist.broadcast(tensor_size, src=
+        dist.broadcast(tensor_size, src=src, group=dist_group)
         size = tensor_size.item()

         if size == 0:
             return []

         tensor_data = torch.empty(size, dtype=torch.uint8)
-        dist.broadcast(tensor_data, src=
+        dist.broadcast(tensor_data, src=src, group=dist_group)

         serialized_data = bytes(tensor_data.cpu().numpy())
         data = pickle.loads(serialized_data)
@@ -1206,7 +1210,6 @@ def _cuda_device_count_stateless(cuda_visible_devices: Optional[str] = None) ->
     # https://github.com/pytorch/pytorch/blob/
     # c1cd946818442aca8c7f812b16d187ce1586c3bc/
     # torch/cuda/__init__.py#L831C1-L831C17
-    import torch.cuda
     import torch.version

     if not torch.cuda._is_compiled():
@@ -1335,3 +1338,40 @@ def parse_tool_response(text, tools, **kwargs):
         for call_info in call_info_list
     ]
     return text, call_info_list
+
+
+class MultiprocessingSerializer:
+    @staticmethod
+    def serialize(obj):
+        buf = io.BytesIO()
+        ForkingPickler(buf).dump(obj)
+        buf.seek(0)
+        return buf.read()
+
+    @staticmethod
+    def deserialize(data):
+        return ForkingPickler.loads(data)
+
+
+def debug_timing(func):
+    # todo: replace with a more organized instrumentation
+    def wrapper(*args, **kwargs):
+        if logger.isEnabledFor(logging.DEBUG):
+            tic = torch.cuda.Event(enable_timing=True)
+            toc = torch.cuda.Event(enable_timing=True)
+            tic.record()
+            result = func(*args, **kwargs)
+            toc.record()
+            torch.cuda.synchronize()  # Ensure all CUDA operations are complete
+            elapsed = tic.elapsed_time(toc)
+            indices = kwargs.get("indices", args[1] if len(args) > 1 else None)
+            num_tokens = len(indices) if indices is not None else 0
+            throughput = num_tokens / elapsed * 1000 if elapsed > 0 else 0
+            logger.debug(
+                f"Transfer time: {elapsed} ms, throughput: {throughput} tokens/s"
+            )
+            return result
+        else:
+            return func(*args, **kwargs)
+
+    return wrapper
sglang/test/test_programs.py
CHANGED
@@ -509,13 +509,35 @@ def test_hellaswag_select():
         temperature=0,
         num_threads=64,
         progress_bar=True,
+        generator_style=False,
     )
-    preds = [
+    preds = []
+    for i, ret in enumerate(rets):
+        preds.append(choices[i].index(ret["answer"]))
     latency = time.time() - tic

     # Compute accuracy
     accuracy = np.mean(np.array(preds) == np.array(labels))

+    # Test generator style of run_batch
+    tic = time.time()
+    rets = few_shot_hellaswag.run_batch(
+        arguments,
+        temperature=0,
+        num_threads=64,
+        progress_bar=True,
+        generator_style=True,
+    )
+    preds_gen = []
+    for i, ret in enumerate(rets):
+        preds_gen.append(choices[i].index(ret["answer"]))
+    latency_gen = time.time() - tic
+
+    # Compute accuracy
+    accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
+    assert np.abs(accuracy_gen - accuracy) < 0.01
+    assert np.abs(latency_gen - latency) < 1
+
     return accuracy, latency


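The test above exercises a new generator_style flag on run_batch. Below is a hedged sketch of the call pattern; the program, questions, and endpoint URL are illustrative, only the flag itself comes from this release.

import sglang as sgl


@sgl.function
def qa(s, question):
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", max_tokens=32)


# Assumed: a local sglang server is already running on port 30000.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
arguments = [{"question": q} for q in ("What is 2 + 2?", "Name a prime number.")]

# Default (generator_style=False): run_batch returns a list once every program finishes.
states = qa.run_batch(arguments, temperature=0, progress_bar=True)

# generator_style=True: results are yielded one at a time as programs complete,
# in argument order (which is what the index-based accuracy check above relies on).
for state in qa.run_batch(arguments, temperature=0, generator_style=True):
    print(state["answer"])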
sglang/test/test_utils.py
CHANGED
@@ -36,7 +36,7 @@ DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
@@ -532,6 +532,8 @@ def run_bench_serving(
     request_rate,
     other_server_args,
     dataset_name="random",
+    dataset_path="",
+    tokenizer=None,
     random_input_len=4096,
     random_output_len=2048,
     disable_stream=False,
@@ -553,9 +555,9 @@ def run_bench_serving(
         host=None,
         port=None,
         dataset_name=dataset_name,
-        dataset_path=
+        dataset_path=dataset_path,
         model=None,
-        tokenizer=
+        tokenizer=tokenizer,
         num_prompts=num_prompts,
         sharegpt_output_len=None,
         random_input_len=random_input_len,
@@ -657,16 +659,16 @@ STDERR_FILENAME = "stderr.txt"
 STDOUT_FILENAME = "stdout.txt"


-def read_output(output_lines):
+def read_output(output_lines: List[str], filename: str = STDERR_FILENAME):
     """Print the output in real time with another thread."""
-    while not os.path.exists(
+    while not os.path.exists(filename):
         time.sleep(1)

     pt = 0
     while pt >= 0:
-        if pt > 0 and not os.path.exists(
+        if pt > 0 and not os.path.exists(filename):
             break
-        lines = open(
+        lines = open(filename).readlines()
         for line in lines[pt:]:
             print(line, end="", flush=True)
             output_lines.append(line)
@@ -747,6 +749,33 @@ def run_and_check_memory_leak(
     assert has_abort


+def run_command_and_capture_output(command, env: Optional[dict] = None):
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = subprocess.Popen(
+        command, stdout=stdout, stderr=stderr, env=env, text=True
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines, STDOUT_FILENAME))
+    t.start()
+
+    # Join the process
+    process.wait()
+
+    stdout.close()
+    stderr.close()
+    if os.path.exists(STDOUT_FILENAME):
+        os.remove(STDOUT_FILENAME)
+    if os.path.exists(STDERR_FILENAME):
+        os.remove(STDERR_FILENAME)
+    kill_process_tree(process.pid)
+    t.join()
+
+    return output_lines
+
+
 def run_mmlu_test(
     disable_radix_cache=False,
     enable_mixed_chunk=False,
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.1.post3"
+__version__ = "0.4.1.post5"
{sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.1.post3
+Version: 0.4.1.post5
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -239,15 +239,15 @@ Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar>=0.1.6; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
+Requires-Dist: cuda-python; extra == "srt"
+Requires-Dist: sgl-kernel>=0.0.2.post11; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm<=0.6.4.post1,>=0.6.3.post1; extra == "srt"
-Requires-Dist: cuda-python; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.2.post10; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
-Requires-Dist: vllm==0.6.3.
+Requires-Dist: vllm==0.6.3.post2.dev1; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: srt-hpu
@@ -315,7 +315,7 @@ Requires-Dist: sglang[test]; extra == "dev-hpu"

 | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/)
 | [**Documentation**](https://sgl-project.github.io/)
-| [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-
+| [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2um0ad92q-LkU19KQTxCGzlCgRiOiQEw)
 | [**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing)
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |

@@ -347,12 +347,13 @@ The core features include:

 ## Getting Started
 - [Install SGLang](https://sgl-project.github.io/start/install.html)
-- [
-- [Backend
-- [Frontend
+- [Quick Start](https://sgl-project.github.io/start/send_request.html)
+- [Backend Tutorial](https://sgl-project.github.io/backend/openai_api_completions.html)
+- [Frontend Tutorial](https://sgl-project.github.io/frontend/frontend.html)
+- [Contribution Guide](https://sgl-project.github.io/references/contribution_guide.html)

 ## Benchmark and Performance
-Learn more in
+Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)

 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
@@ -361,5 +362,4 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 The project is supported by (alphabetically): AMD, Baseten, DataCrunch, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, LMSYS.org, Meituan, NVIDIA, RunPod, Stanford, UC Berkeley, UCLA, xAI, 01.AI.

 ## Acknowledgment and Citation
-We learned
-Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
+We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.