sglang 0.3.3__py3-none-any.whl → 0.3.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +3 -3
- sglang/srt/conversation.py +11 -2
- sglang/srt/managers/data_parallel_controller.py +177 -0
- sglang/srt/managers/io_struct.py +7 -2
- sglang/srt/managers/schedule_batch.py +6 -0
- sglang/srt/managers/scheduler.py +46 -5
- sglang/srt/managers/tokenizer_manager.py +9 -0
- sglang/srt/model_executor/model_runner.py +40 -35
- sglang/srt/openai_api/adapter.py +5 -3
- sglang/srt/sampling/sampling_batch_info.py +19 -7
- sglang/srt/server.py +55 -20
- sglang/srt/server_args.py +14 -11
- sglang/srt/utils.py +26 -11
- sglang/version.py +1 -1
- {sglang-0.3.3.dist-info → sglang-0.3.3.post1.dist-info}/METADATA +13 -6
- {sglang-0.3.3.dist-info → sglang-0.3.3.post1.dist-info}/RECORD +19 -18
- {sglang-0.3.3.dist-info → sglang-0.3.3.post1.dist-info}/LICENSE +0 -0
- {sglang-0.3.3.dist-info → sglang-0.3.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.3.3.dist-info → sglang-0.3.3.post1.dist-info}/top_level.txt +0 -0
sglang/bench_latency.py
CHANGED
@@ -139,7 +139,7 @@ def load_model(server_args, port_args, tp_rank):
         gpu_id=tp_rank,
         tp_rank=tp_rank,
         tp_size=server_args.tp_size,
-        nccl_port=port_args.
+        nccl_port=port_args.nccl_port,
         server_args=server_args,
     )
     rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
@@ -220,6 +220,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
     return reqs


+@torch.inference_mode()
 def extend(reqs, model_runner):
     batch = ScheduleBatch.init_new(
         reqs=reqs,
@@ -235,6 +236,7 @@ def extend(reqs, model_runner):
     return next_token_ids, logits_output.next_token_logits, batch


+@torch.inference_mode()
 def decode(input_token_ids, batch, model_runner):
     batch.prepare_for_decode(input_token_ids)
     model_worker_batch = batch.get_model_worker_batch()
@@ -244,7 +246,6 @@ def decode(input_token_ids, batch, model_runner):
     return next_token_ids, logits_output.next_token_logits


-@torch.inference_mode()
 def correctness_test(
     server_args,
     port_args,
@@ -287,7 +288,6 @@ def correctness_test(
         rank_print(tokenizer.decode(output_ids[i]), "\n")


-@torch.inference_mode()
 def latency_test_run_once(
     run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
 ):
sglang/srt/conversation.py
CHANGED
@@ -70,6 +70,9 @@ class Conversation:
     sep2: str = None
     # Stop criteria (the default one is EOS token)
     stop_str: Union[str, List[str]] = None
+    # The string that represents an image token in the prompt
+    image_token: str = "<image>"
+
     image_data: Optional[List[str]] = None
     modalities: Optional[List[str]] = None

@@ -334,6 +337,7 @@ class Conversation:
             sep=self.sep,
             sep2=self.sep2,
             stop_str=self.stop_str,
+            image_token=self.image_token,
         )

     def dict(self):
@@ -381,6 +385,7 @@ def generate_chat_conv(
         stop_str=conv.stop_str,
         image_data=[],
         modalities=[],
+        image_token=conv.image_token,
     )

     if isinstance(request.messages, str):
@@ -412,9 +417,13 @@ def generate_chat_conv(
                     num_image_url += 1
                     conv.modalities.append(content.modalities)
             if num_image_url > 1:
-                image_token =
+                image_token = conv.image_token
             else:
-                image_token =
+                image_token = (
+                    conv.image_token + "\n"
+                    if conv.name != "qwen2-vl"
+                    else conv.image_token
+                )
             for content in message.content:
                 if content.type == "text":
                     if num_image_url > 16:
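The last hunk decides which placeholder each image URL contributes to the prompt. A minimal, self-contained restatement of that rule for reference (the function name and the "some-other-template" value are illustrative, not SGLang identifiers):

def image_placeholder(conv_name: str, num_image_url: int, image_token: str = "<image>") -> str:
    """Mirror of the branch above: with several image URLs the raw token is used;
    with a single one, a trailing newline is appended except for qwen2-vl templates."""
    if num_image_url > 1:
        return image_token
    return image_token if conv_name == "qwen2-vl" else image_token + "\n"


print(repr(image_placeholder("qwen2-vl", 1)))            # '<image>'
print(repr(image_placeholder("some-other-template", 1)))  # '<image>\n'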
sglang/srt/managers/data_parallel_controller.py
ADDED
@@ -0,0 +1,177 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""A controller that dispatches requests to multiple data parallel workers."""
+
+import logging
+import multiprocessing as mp
+from enum import Enum, auto
+
+import zmq
+
+from sglang.srt.managers.io_struct import (
+    TokenizedEmbeddingReqInput,
+    TokenizedGenerateReqInput,
+    TokenizedRewardReqInput,
+)
+from sglang.srt.managers.scheduler import run_scheduler_process
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.utils import (
+    configure_logger,
+    kill_parent_process,
+    suppress_other_loggers,
+)
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
+
+
+class LoadBalanceMethod(Enum):
+    """Load balance method."""
+
+    ROUND_ROBIN = auto()
+    SHORTEST_QUEUE = auto()
+
+    @classmethod
+    def from_str(cls, method: str):
+        method = method.upper()
+        try:
+            return cls[method]
+        except KeyError as exc:
+            raise ValueError(f"Invalid load balance method: {method}") from exc
+
+
+class DataParallelController:
+    """A controller that dispatches requests to multiple data parallel workers."""
+
+    def __init__(self, server_args, port_args) -> None:
+        # Parse args
+        self.server_args = server_args
+        self.port_args = port_args
+        self.load_balance_method = LoadBalanceMethod.from_str(
+            server_args.load_balance_method
+        )
+
+        # Init inter-process communication
+        self.context = zmq.Context(1 + server_args.dp_size)
+        self.recv_from_tokenizer = self.context.socket(zmq.PULL)
+        self.recv_from_tokenizer.bind(f"ipc://{port_args.scheduler_input_ipc_name}")
+
+        # Dispatch method
+        self.round_robin_counter = 0
+        dispatch_lookup = {
+            LoadBalanceMethod.ROUND_ROBIN: self.round_robin_scheduler,
+            LoadBalanceMethod.SHORTEST_QUEUE: self.shortest_queue_scheduler,
+        }
+        self.dispatching = dispatch_lookup[self.load_balance_method]
+
+        # Start data parallel workers
+        base_gpu_id = 0
+        self.workers = []
+        for dp_rank in range(server_args.dp_size):
+            tmp_port_args = PortArgs.init_new(server_args)
+            tmp_port_args.detokenizer_ipc_name = port_args.detokenizer_ipc_name
+
+            send_to = self.launch_tensor_parallel_group(
+                server_args,
+                tmp_port_args,
+                base_gpu_id,
+                dp_rank,
+            )
+
+            self.workers.append(send_to)
+            base_gpu_id += server_args.tp_size
+
+    def launch_tensor_parallel_group(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        base_gpu_id: int,
+        dp_rank: int,
+    ):
+        # Launch tensor parallel scheduler processes
+        scheduler_procs = []
+        scheduler_pipe_readers = []
+        tp_size_per_node = server_args.tp_size // server_args.nnodes
+        tp_rank_range = range(
+            tp_size_per_node * server_args.node_rank,
+            tp_size_per_node * (server_args.node_rank + 1),
+        )
+        for tp_rank in tp_rank_range:
+            reader, writer = mp.Pipe(duplex=False)
+            gpu_id = base_gpu_id + tp_rank % tp_size_per_node
+            proc = mp.Process(
+                target=run_scheduler_process,
+                args=(server_args, port_args, gpu_id, tp_rank, dp_rank, writer),
+            )
+            proc.start()
+            scheduler_procs.append(proc)
+            scheduler_pipe_readers.append(reader)
+
+        send_to = self.context.socket(zmq.PUSH)
+        send_to.connect(f"ipc://{port_args.scheduler_input_ipc_name}")
+
+        # Wait for model to finish loading
+        for i in range(len(scheduler_pipe_readers)):
+            scheduler_pipe_readers[i].recv()
+
+        return send_to
+
+    def round_robin_scheduler(self, req):
+        self.workers[self.round_robin_counter].send_pyobj(req)
+        self.round_robin_counter = (self.round_robin_counter + 1) % len(self.workers)
+
+    def shortest_queue_scheduler(self, input_requests):
+        raise NotImplementedError()
+
+    def event_loop(self):
+        while True:
+            while True:
+                try:
+                    recv_req = self.recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK)
+                except zmq.ZMQError:
+                    break
+
+                if isinstance(
+                    recv_req,
+                    (
+                        TokenizedGenerateReqInput,
+                        TokenizedEmbeddingReqInput,
+                        TokenizedRewardReqInput,
+                    ),
+                ):
+                    self.dispatching(recv_req)
+                else:
+                    # Send other control messages to all workers
+                    for worker in self.workers:
+                        worker.queue.put(recv_req)
+
+
+def run_data_parallel_controller_process(
+    server_args: ServerArgs,
+    port_args: PortArgs,
+    pipe_writer,
+):
+    configure_logger(server_args)
+    suppress_other_loggers()
+
+    try:
+        controller = DataParallelController(server_args, port_args)
+        pipe_writer.send("ready")
+        controller.event_loop()
+    except Exception:
+        msg = get_exception_traceback()
+        logger.error(msg)
+        kill_parent_process()
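The controller's default policy is the round-robin dispatch shown above: every tokenized request goes to the next data parallel replica's scheduler socket. A toy stand-in that uses plain Python lists in place of the ZMQ PUSH sockets, just to illustrate the counter arithmetic:

class RoundRobinDispatcher:
    """Simplified stand-in for DataParallelController.round_robin_scheduler."""

    def __init__(self, workers):
        # In SGLang each worker is a zmq PUSH socket; here a list collects requests.
        self.workers = workers
        self.round_robin_counter = 0

    def dispatch(self, req):
        self.workers[self.round_robin_counter].append(req)  # send_pyobj(req) in the real code
        self.round_robin_counter = (self.round_robin_counter + 1) % len(self.workers)


replicas = [[], [], []]  # pretend dp_size == 3
dispatcher = RoundRobinDispatcher(replicas)
for i in range(7):
    dispatcher.dispatch(f"req-{i}")
print([len(r) for r in replicas])  # [3, 2, 2]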
sglang/srt/managers/io_struct.py
CHANGED
@@ -20,6 +20,7 @@ processes (TokenizerManager, DetokenizerManager, Controller).

 import uuid
 from dataclasses import dataclass
+from enum import Enum
 from typing import Dict, List, Optional, Union

 from sglang.srt.managers.schedule_batch import BaseFinishReason
@@ -119,8 +120,7 @@ class GenerateReqInput:
             elif not isinstance(self.image_data, list):
                 self.image_data = [self.image_data] * num
             elif isinstance(self.image_data, list):
-
-                self.image_data = self.image_data * num
+                pass

             if self.sampling_params is None:
                 self.sampling_params = [{}] * num
@@ -344,3 +344,8 @@ class UpdateWeightReqOutput:
 class AbortReq:
     # The request id
     rid: str
+
+
+class ProfileReq(Enum):
+    START_PROFILE = 1
+    STOP_PROFILE = 2
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -423,6 +423,9 @@ class ScheduleBatch:
     # Stream
     has_stream: bool = False

+    # device
+    device: str = "cuda"
+
     # Has regex
     has_regex: bool = False

@@ -439,6 +442,7 @@ class ScheduleBatch:
             tree_cache=tree_cache,
             return_logprob=return_logprob,
             has_stream=has_stream,
+            device=req_to_token_pool.device,
             has_regex=has_regex,
         )

@@ -806,6 +810,8 @@ class ScheduleBatch:
             self.sampling_info.regex_fsm_states = [
                 req.regex_fsm_state for req in self.reqs
             ]
+        else:
+            self.sampling_info.regex_fsms = None

         return ModelWorkerBatch(
             forward_mode=self.forward_mode,
sglang/srt/managers/scheduler.py
CHANGED
@@ -37,6 +37,7 @@ from sglang.srt.managers.io_struct import (
     BatchEmbeddingOut,
     BatchTokenIDOut,
     FlushCacheReq,
+    ProfileReq,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
     TokenizedRewardReqInput,
@@ -141,7 +142,7 @@ class Scheduler:
             gpu_id=gpu_id,
             tp_rank=tp_rank,
             server_args=server_args,
-            nccl_port=port_args.
+            nccl_port=port_args.nccl_port,
         )
         self.tp_cpu_group = self.tp_worker.model_runner.tp_group.cpu_group

@@ -229,6 +230,22 @@ class Scheduler:
         self.new_token_ratio_decay = global_config.new_token_ratio_decay
         self.batch_is_full = False

+        if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "":
+            self.profiler = None
+        else:
+            self.torch_profiler_trace_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR")
+            logger.info(
+                "Profiling enabled. Traces will be saved to: %s",
+                self.torch_profiler_trace_dir,
+            )
+            self.profiler = torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.CUDA,
+                ],
+                with_stack=True,
+            )
+
     @torch.inference_mode()
     def event_loop(self):
         while True:
@@ -271,6 +288,11 @@ class Scheduler:
             elif isinstance(recv_req, UpdateWeightReqInput):
                 success, message = self.update_weights(recv_req)
                 self.out_pyobjs.append(UpdateWeightReqOutput(success, message))
+            elif isinstance(recv_req, ProfileReq):
+                if recv_req == ProfileReq.START_PROFILE:
+                    self.start_profile()
+                else:
+                    self.stop_profile()
             else:
                 raise ValueError(f"Invalid request: {recv_req}")

@@ -433,6 +455,9 @@ class Scheduler:
                 result = self.run_batch(batch)
                 self.process_batch_result(batch, result)

+            if self.running_batch.is_empty():
+                self.running_batch = None
+
             if self.running_batch is None:
                 break

@@ -772,9 +797,6 @@ class Scheduler:
         if self.tp_rank == 0 and self.decode_forward_ct % 40 == 0:
             self.print_decode_stats()

-        if self.running_batch.is_empty():
-            self.running_batch = None
-
     def add_logprob_return_values(
         self,
         i: int,
@@ -1000,15 +1022,34 @@ class Scheduler:
             logger.error(message)
         return success, message

+    def start_profile(self) -> None:
+        if self.profiler is None:
+            raise RuntimeError("Profiler is not enabled.")
+        self.profiler.start()
+
+    def stop_profile(self) -> None:
+        if self.profiler is None:
+            raise RuntimeError("Profiler is not enabled.")
+        self.profiler.stop()
+        self.profiler.export_chrome_trace(
+            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+        )
+        logger.info("Profiler is done")
+

 def run_scheduler_process(
     server_args: ServerArgs,
     port_args: PortArgs,
     gpu_id: int,
     tp_rank: int,
+    dp_rank: Optional[int],
     pipe_writer,
 ):
-
+    if dp_rank is None:
+        configure_logger(server_args, prefix=f" TP{tp_rank}")
+    else:
+        configure_logger(server_args, prefix=f" DP{dp_rank} TP{tp_rank}")
+
     suppress_other_loggers()

     try:
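When SGLANG_TORCH_PROFILER_DIR is set, the scheduler builds the torch profiler at startup and drives it from ProfileReq messages. For reference, the same start/stop/export pattern in isolation (a sketch: the fallback directory and the matmul loop are placeholders, and the CUDA activity assumes a GPU build of PyTorch):

import os
import time

import torch

trace_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp/sglang_traces")  # placeholder fallback
os.makedirs(trace_dir, exist_ok=True)

profiler = torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,  # assumes CUDA; drop on CPU-only machines
    ],
    with_stack=True,
)

profiler.start()  # what the scheduler does on ProfileReq.START_PROFILE
x = torch.randn(512, 512)
for _ in range(10):
    x = (x @ x) / 512  # stand-in for the forward passes being profiled
profiler.stop()  # what it does on ProfileReq.STOP_PROFILE
profiler.export_chrome_trace(os.path.join(trace_dir, f"{time.time()}.trace.json.gz"))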
sglang/srt/managers/tokenizer_manager.py
CHANGED
@@ -46,6 +46,7 @@ from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     FlushCacheReq,
     GenerateReqInput,
+    ProfileReq,
     RewardReqInput,
     TokenizedEmbeddingReqInput,
     TokenizedGenerateReqInput,
@@ -512,6 +513,14 @@ class TokenizerManager:
         req = AbortReq(rid)
         self.send_to_scheduler.send_pyobj(req)

+    def start_profile(self):
+        req = ProfileReq.START_PROFILE
+        self.send_to_scheduler.send_pyobj(req)
+
+    def stop_profile(self):
+        req = ProfileReq.STOP_PROFILE
+        self.send_to_scheduler.send_pyobj(req)
+
     async def update_weights(
         self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None
     ):
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -81,10 +81,11 @@ class ModelRunner:
         # Parse args
         self.model_config = model_config
         self.mem_fraction_static = mem_fraction_static
+        self.device = server_args.device
         self.gpu_id = gpu_id
         self.tp_rank = tp_rank
         self.tp_size = tp_size
-        self.
+        self.dist_port = nccl_port
         self.server_args = server_args
         self.is_multimodal_model = is_multimodal_model(
             self.model_config.hf_config.architectures
@@ -95,7 +96,7 @@ class ModelRunner:
             self.model_config.attention_arch == AttentionArch.MLA
             and not self.server_args.disable_mla
         ):
-            logger.info("MLA optimization is
+            logger.info("MLA optimization is turned on. Use triton backend.")
             self.server_args.attention_backend = "triton"

         if self.is_multimodal_model:
@@ -132,39 +133,45 @@ class ModelRunner:
             server_args.max_running_requests,
             server_args.max_total_tokens,
         )
-        self.
-
-
+        if self.device == "cuda":
+            self.init_cublas()
+            self.init_attention_backend()
+            self.init_cuda_graphs()
+        else:
+            self.init_attention_backend()

     def init_torch_distributed(self):
+        logger.info("Init torch distributed begin.")
         # Init torch distributed
-
-
+        if self.device == "cuda":
+            torch.cuda.set_device(self.gpu_id)
+            backend = "nccl"

         if not self.server_args.enable_p2p_check:
             monkey_patch_vllm_p2p_access_check(self.gpu_id)
-
         if self.server_args.dist_init_addr:
-
+            dist_init_method = f"tcp://{self.server_args.dist_init_addr}"
         else:
-
+            dist_init_method = f"tcp://127.0.0.1:{self.dist_port}"
         set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
         init_distributed_environment(
-            backend=
+            backend=backend,
             world_size=self.tp_size,
             rank=self.tp_rank,
             local_rank=self.gpu_id,
-            distributed_init_method=
+            distributed_init_method=dist_init_method,
         )
         initialize_model_parallel(tensor_model_parallel_size=self.tp_size)
         min_per_gpu_memory = get_available_gpu_memory(
-            self.gpu_id, distributed=self.tp_size > 1
+            self.device, self.gpu_id, distributed=self.tp_size > 1
         )
         self.tp_group = get_tp_group()

         # Currently, there is a bug with mulit-node tensor parallelsim + padded cuda graph,
         # so we disable padding in cuda graph.
-        if not all(
+        if self.device == "cuda" and not all(
+            in_the_same_node_as(self.tp_group.cpu_group, source_rank=0)
+        ):
             self.server_args.disable_cuda_graph_padding = True
             logger.info(
                 "Setting disable_cuda_graph_padding to True because of multi-node tensor parallelism."
@@ -172,7 +179,7 @@ class ModelRunner:

         # Check memory for tensor parallelism
         if self.tp_size > 1:
-            local_gpu_memory = get_available_gpu_memory(self.gpu_id)
+            local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id)
             if min_per_gpu_memory < local_gpu_memory * 0.9:
                 raise ValueError(
                     "The memory capacity is unbalanced. Some GPUs may be occupied by other processes."
@@ -182,23 +189,22 @@ class ModelRunner:

     def load_model(self):
         logger.info(
-            f"Load weight begin. avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
+            f"Load weight begin. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )

         # This can reduce thread conflicts and speed up weight loading.
         torch.set_num_threads(1)
-
-
-
-
-
-
-
-
+        if self.device == "cuda":
+            if torch.cuda.get_device_capability()[0] < 8:
+                logger.info(
+                    "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
+                )
+                self.server_args.dtype = "float16"
+                if torch.cuda.get_device_capability()[1] < 5:
+                    raise RuntimeError("SGLang only supports sm75 and above.")

         # Prepare the vllm model config
         monkey_patch_vllm_dummy_weight_loader()
-        self.device_config = DeviceConfig()
         self.load_config = LoadConfig(load_format=self.server_args.load_format)
         self.vllm_model_config = VllmModelConfig(
             model=self.server_args.model_path,
@@ -220,7 +226,7 @@ class ModelRunner:
         self.model = get_model(
             model_config=self.vllm_model_config,
             load_config=self.load_config,
-            device_config=self.
+            device_config=DeviceConfig(self.device),
             parallel_config=None,
             scheduler_config=None,
             lora_config=None,
@@ -240,7 +246,7 @@ class ModelRunner:
             f"Load weight end. "
             f"type={type(self.model).__name__}, "
             f"dtype={self.dtype}, "
-            f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )

     def update_weights(self, model_path: str, load_format: str):
@@ -254,10 +260,10 @@ class ModelRunner:

         logger.info(
             f"Update weights begin. "
-            f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )

-        target_device = torch.device(self.
+        target_device = torch.device(self.device)

         try:
             # TODO: Use a better method to check this
@@ -343,7 +349,7 @@ class ModelRunner:

     def profile_max_num_token(self, total_gpu_memory: int):
         available_gpu_memory = get_available_gpu_memory(
-            self.gpu_id, distributed=self.tp_size > 1
+            self.device, self.gpu_id, distributed=self.tp_size > 1
         )
         if (
             self.model_config.attention_arch == AttentionArch.MLA
@@ -409,11 +415,10 @@ class ModelRunner:
                 4096,
             )

-        device = "cuda"
         self.req_to_token_pool = ReqToTokenPool(
             size=max_num_reqs + 1,
             max_context_len=self.model_config.context_len + 4,
-            device=device,
+            device=self.device,
         )
         if (
             self.model_config.attention_arch == AttentionArch.MLA
@@ -425,7 +430,7 @@ class ModelRunner:
                 kv_lora_rank=self.model_config.kv_lora_rank,
                 qk_rope_head_dim=self.model_config.qk_rope_head_dim,
                 layer_num=self.model_config.num_hidden_layers,
-                device=device,
+                device=self.device,
             )
         else:
             self.token_to_kv_pool = MHATokenToKVPool(
@@ -434,11 +439,11 @@ class ModelRunner:
                 head_num=self.model_config.get_num_kv_heads(self.tp_size),
                 head_dim=self.model_config.head_dim,
                 layer_num=self.model_config.num_hidden_layers,
-                device=device,
+                device=self.device,
             )
         logger.info(
             f"Memory pool end. "
-            f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )

     def init_cublas(self):
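The load path now gates the dtype fallback and the minimum-capability check behind the CUDA device check. A quick standalone diagnostic (a sketch, not SGLang code) to see which side of those gates a given GPU falls on:

import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    print(f"compute capability: sm{major}{minor}")
    if major < 8:
        print("below sm80: the server would fall back to float16")
        if minor < 5:
            print("below sm75: not supported by SGLang")
else:
    print("no CUDA device visible; the cuda-only checks above are skipped")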
sglang/srt/openai_api/adapter.py
CHANGED
@@ -117,7 +117,9 @@ def create_streaming_error_response(
 def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
     global chat_template_name

-    logger.info(
+    logger.info(
+        f"Use chat template for the OpenAI-compatible API server: {chat_template_arg}"
+    )
     if not chat_template_exists(chat_template_arg):
         if not os.path.exists(chat_template_arg):
             raise RuntimeError(
@@ -924,7 +926,7 @@ def v1_chat_generate_request(
         else:
             prompt_kwargs = {"input_ids": input_ids}
         sampling_params_list = sampling_params_list[0]
-
+        image_data_list = image_data_list[0]
         return_logprobs = return_logprobs[0]
         logprob_start_lens = logprob_start_lens[0]
         top_logprobs_nums = top_logprobs_nums[0]
@@ -937,7 +939,7 @@ def v1_chat_generate_request(

     adapted_request = GenerateReqInput(
         **prompt_kwargs,
-        image_data=
+        image_data=image_data_list,
         sampling_params=sampling_params_list,
         return_logprob=return_logprobs,
         logprob_start_len=logprob_start_lens,
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -37,6 +37,9 @@ class SamplingBatchInfo:
     linear_penalties: torch.Tensor = None
     scaling_penalties: torch.Tensor = None

+    # Device
+    device: str = "cuda"
+
     @classmethod
     def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
         reqs = batch.reqs
@@ -62,6 +65,7 @@ class SamplingBatchInfo:
             min_ps=min_ps,
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
             vocab_size=vocab_size,
+            device=batch.input_ids.device,
         )
         # TODO (lianmin): `need_min_p_sampling` needs to be updated in filter and merge.

@@ -75,7 +79,7 @@ class SamplingBatchInfo:
         ret.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
             vocab_size=vocab_size,
             batch=batch,
-            device=
+            device=batch.input_ids.device,
             Penalizers={
                 penaltylib.BatchedFrequencyPenalizer,
                 penaltylib.BatchedMinNewTokensPenalizer,
@@ -107,7 +111,7 @@ class SamplingBatchInfo:
                     self.linear_penalties = torch.zeros(
                         (bs, self.vocab_size),
                         dtype=torch.float32,
-                        device=
+                        device=self.device,
                     )
                 self.linear_penalties = penalizer.apply(self.linear_penalties)

@@ -119,7 +123,10 @@ class SamplingBatchInfo:

         if has_regex:
             self.vocab_mask = torch.zeros(
-                len(self.temperatures),
+                len(self.temperatures),
+                self.vocab_size,
+                dtype=torch.bool,
+                device=self.device,
             )
             for i, regex_fsm in enumerate(self.regex_fsms):
                 if regex_fsm is not None:
@@ -144,7 +151,12 @@ class SamplingBatchInfo:

     @staticmethod
     def merge_bias_tensor(
-        lhs: torch.Tensor,
+        lhs: torch.Tensor,
+        rhs: torch.Tensor,
+        bs1: int,
+        bs2: int,
+        device: str,
+        default: int = 0,
     ):
         # bias tensor can be None
         if lhs is not None or rhs is not None:
@@ -155,9 +167,9 @@ class SamplingBatchInfo:
                 shape, dtype = rhs.shape[1:], rhs.dtype
             with torch.dtype(dtype):
                 if lhs is None:
-                    lhs = torch.empty((bs1, *shape), device=
+                    lhs = torch.empty((bs1, *shape), device=device).fill_(default)
                 if rhs is None:
-                    rhs = torch.empty((bs2, *shape), device=
+                    rhs = torch.empty((bs2, *shape), device=device).fill_(default)
             return torch.cat([lhs, rhs])

         return None
@@ -176,5 +188,5 @@ class SamplingBatchInfo:
             setattr(self, item, torch.concat([self_val, other_val]))

         self.logit_bias = SamplingBatchInfo.merge_bias_tensor(
-            self.logit_bias, other.logit_bias, len(self), len(other)
+            self.logit_bias, other.logit_bias, len(self), len(other), self.device
         )
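merge_bias_tensor now takes the target device explicitly, so the batch that is missing a bias can be materialized on the right device before concatenation. A self-contained toy with the same semantics (simplified: it uses torch.full instead of empty().fill_() and skips the dtype handling):

import torch


def merge_optional_bias(lhs, rhs, bs1, bs2, device, default=0.0):
    """If only one batch carries a bias tensor, fill in the other side with
    `default` on `device` so the two batches can be concatenated row-wise."""
    if lhs is None and rhs is None:
        return None
    template = lhs if lhs is not None else rhs
    shape, dtype = template.shape[1:], template.dtype
    if lhs is None:
        lhs = torch.full((bs1, *shape), default, dtype=dtype, device=device)
    if rhs is None:
        rhs = torch.full((bs2, *shape), default, dtype=dtype, device=device)
    return torch.cat([lhs, rhs])


merged = merge_optional_bias(None, torch.ones(2, 4), bs1=3, bs2=2, device="cpu")
print(merged.shape)  # torch.Size([5, 4])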
sglang/srt/server.py
CHANGED
@@ -44,6 +44,9 @@ from fastapi.responses import JSONResponse, Response, StreamingResponse

 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.hf_transformers_utils import get_tokenizer
+from sglang.srt.managers.data_parallel_controller import (
+    run_data_parallel_controller_process,
+)
 from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
 from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
@@ -145,6 +148,28 @@ async def flush_cache():
     )


+@app.get("/start_profile")
+@app.post("/start_profile")
+async def start_profile():
+    """Start profiling."""
+    tokenizer_manager.start_profile()
+    return Response(
+        content="Start profiling.\n",
+        status_code=200,
+    )
+
+
+@app.get("/stop_profile")
+@app.post("/stop_profile")
+async def stop_profile():
+    """Stop profiling."""
+    tokenizer_manager.stop_profile()
+    return Response(
+        content="Stop profiling. This will take some time.\n",
+        status_code=200,
+    )
+
+
 @app.post("/update_weights")
 async def update_weights(obj: UpdateWeightReqInput, request: Request):
     """Update the weights inplace without re-launching the server."""
@@ -315,30 +340,40 @@ def launch_engine(
         server_args.model_path, server_args.tokenizer_path
     )

-
-
-
-
-
-
-
-
-
+    if server_args.dp_size == 1:
+        # Launch tensor parallel scheduler processes
+        scheduler_procs = []
+        scheduler_pipe_readers = []
+        tp_size_per_node = server_args.tp_size // server_args.nnodes
+        tp_rank_range = range(
+            tp_size_per_node * server_args.node_rank,
+            tp_size_per_node * (server_args.node_rank + 1),
+        )
+        for tp_rank in tp_rank_range:
+            reader, writer = mp.Pipe(duplex=False)
+            gpu_id = tp_rank % tp_size_per_node
+            proc = mp.Process(
+                target=run_scheduler_process,
+                args=(server_args, port_args, gpu_id, tp_rank, None, writer),
+            )
+            proc.start()
+            scheduler_procs.append(proc)
+            scheduler_pipe_readers.append(reader)
+
+        if server_args.node_rank >= 1:
+            # For other nodes, they do not need to run tokenizer or detokenizer,
+            # so they can just wait here.
+            while True:
+                pass
+    else:
+        # Launch the data parallel controller
         reader, writer = mp.Pipe(duplex=False)
-
+        scheduler_pipe_readers = [reader]
         proc = mp.Process(
-            target=
-            args=(server_args, port_args,
+            target=run_data_parallel_controller_process,
+            args=(server_args, port_args, writer),
         )
         proc.start()
-        scheduler_procs.append(proc)
-        scheduler_pipe_readers.append(reader)
-
-    if server_args.node_rank >= 1:
-        # For other nodes, they do not need to run tokenizer or detokenizer,
-        # so they can just wait here.
-        while True:
-            pass

     # Launch detokenizer process
     detoken_proc = mp.Process(
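With these endpoints in place, profiling can be toggled from the client side. A sketch of the round trip (the host and port are assumptions for a locally launched server; SGLANG_TORCH_PROFILER_DIR must be set in the server process, as shown in the scheduler changes above):

import requests

base_url = "http://127.0.0.1:30000"  # assumed local server address; adjust to your --port

requests.post(f"{base_url}/start_profile")  # scheduler receives ProfileReq.START_PROFILE
# ... issue some /generate traffic here so there is something to capture ...
requests.post(f"{base_url}/stop_profile")   # trace is written under SGLANG_TORCH_PROFILER_DIR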
sglang/srt/server_args.py
CHANGED
@@ -36,6 +36,7 @@ class ServerArgs:
     skip_tokenizer_init: bool = False
     load_format: str = "auto"
     dtype: str = "auto"
+    device: str = "cuda"
     kv_cache_dtype: str = "auto"
     trust_remote_code: bool = True
     context_length: Optional[int] = None
@@ -237,6 +238,13 @@ class ServerArgs:
             '* "float" is shorthand for FP32 precision.\n'
             '* "float32" for FP32 precision.',
         )
+        parser.add_argument(
+            "--device",
+            type=str,
+            default="cuda",
+            choices=["cuda"],
+            help="The device type.",
+        )
         parser.add_argument(
             "--kv-cache-dtype",
             type=str,
@@ -566,7 +574,7 @@ class ServerArgs:
             self.tp_size % self.nnodes == 0
         ), "tp_size must be divisible by number of nodes"
         assert not (
-            self.dp_size > 1 and self.
+            self.dp_size > 1 and self.nnodes != 1
         ), "multi-node data parallel is not supported"
         assert (
             self.max_loras_per_batch > 0
@@ -575,11 +583,6 @@ class ServerArgs:
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and cuda graph and radix attention is in progress"

-        assert self.dp_size == 1, (
-            "The support for data parallelism is temporarily disabled during refactor. "
-            "Please use sglang<=0.3.2 or wait for later updates."
-        )
-
         if isinstance(self.lora_paths, list):
             lora_paths = self.lora_paths
             self.lora_paths = {}
@@ -618,11 +621,11 @@ class PortArgs:
     # The ipc filename for detokenizer to receive inputs from scheduler (zmq)
     detokenizer_ipc_name: str

-    # The port for nccl initialization
-
+    # The port for nccl initialization (torch.dist)
+    nccl_port: int

-    @
-    def init_new(
+    @staticmethod
+    def init_new(server_args) -> "PortArgs":
         port = server_args.port + 1
         while True:
             if is_port_available(port):
@@ -633,7 +636,7 @@ class PortArgs:
             tokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
             scheduler_input_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
             detokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
-
+            nccl_port=port,
         )

sglang/srt/utils.py
CHANGED
@@ -140,26 +140,41 @@ def calculate_time(show=False, min_cost_ms=0.0):
     return wrapper


-def get_available_gpu_memory(gpu_id, distributed=False):
+def get_available_gpu_memory(device, gpu_id, distributed=False):
     """
     Get available memory for cuda:gpu_id device.
     When distributed is True, the available memory is the minimum available memory of all GPUs.
     """
-
-
+    if device == "cuda":
+        num_gpus = torch.cuda.device_count()
+        assert gpu_id < num_gpus
+
+        if torch.cuda.current_device() != gpu_id:
+            print(
+                f"WARNING: current device is not {gpu_id}, but {torch.cuda.current_device()}, ",
+                "which may cause useless memory allocation for torch CUDA context.",
+            )

-
-
-            f"WARNING: current device is not {gpu_id}, but {torch.cuda.current_device()}, ",
-            "which may cause useless memory allocation for torch CUDA context.",
-        )
+        torch.cuda.empty_cache()
+        free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)

-
-
+    elif device == "xpu":
+        num_gpus = torch.xpu.device_count()
+        assert gpu_id < num_gpus
+
+        if torch.xpu.current_device() != gpu_id:
+            print(
+                f"WARNING: current device is not {gpu_id}, but {torch.xpu.current_device()}, ",
+                "which may cause useless memory allocation for torch XPU context.",
+            )
+        torch.xpu.empty_cache()
+        used_memory = torch.xpu.memory_allocated()
+        total_gpu_memory = torch.xpu.get_device_properties(gpu_id).total_memory
+        free_gpu_memory = total_gpu_memory - used_memory

     if distributed:
         tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
-            torch.device(
+            torch.device(device, gpu_id)
         )
         torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.MIN)
         free_gpu_memory = tensor.item()
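The helper now takes the device string first, with separate CUDA and XPU branches deciding how free memory is measured. A standalone sketch of the CUDA-side measurement for readers who want the same number outside the server (assumes a CUDA-enabled PyTorch; the GB conversion mirrors how the callers log "avail mem ... GB"):

import torch


def available_cuda_memory_gb(gpu_id: int = 0) -> float:
    """Free memory reported by the CUDA driver for one GPU, in GB."""
    assert gpu_id < torch.cuda.device_count()
    torch.cuda.empty_cache()  # drop cached blocks so the driver number is meaningful
    free_bytes, _total_bytes = torch.cuda.mem_get_info(gpu_id)
    return free_bytes / (1 << 30)


if torch.cuda.is_available():
    print(f"avail mem={available_cuda_memory_gb(0):.2f} GB")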
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.3"
+__version__ = "0.3.3.post1"
{sglang-0.3.3.dist-info → sglang-0.3.3.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.3
+Version: 0.3.3.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -257,8 +257,8 @@ Requires-Dist: sentence-transformers; extra == "test"
 Requires-Dist: accelerate; extra == "test"
 Requires-Dist: peft; extra == "test"

-<div align="center">
-<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+<div align="center" id="sglangtop">
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>

 [](https://pypi.org/project/sglang)

@@ -270,10 +270,9 @@ Requires-Dist: peft; extra == "test"

 --------------------------------------------------------------------------------

-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pptx) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |

 ## Upcoming Events
-- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
 - [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.

 ## News
@@ -324,7 +323,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
@@ -848,3 +847,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 ## Citation And Acknowledgment
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+
+
+
+<p align="center">
+  <a href="#sglangtop" target="_blank">
+    <bold>Back To Top </bold>
+  </a>
+</p>
{sglang-0.3.3.dist-info → sglang-0.3.3.post1.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 sglang/__init__.py,sha256=b_pqO9bR2fjK9En_tigfzKTiQzE8b_hUizY0DAKVk1M,1616
 sglang/api.py,sha256=5x591S4rLbmNPs75qPwGKVu1sonVGDyjPAJlHTyWw50,6956
-sglang/bench_latency.py,sha256=
+sglang/bench_latency.py,sha256=gCS_nPNCf3sYR83jg6_KNadm-Xy7-I1V-UdZaoKFi8M,17889
 sglang/bench_server_latency.py,sha256=rRSDqjJ5jan9AzppOGx75KRUjZCU2dUG2h06CQOdJgk,5377
 sglang/bench_serving.py,sha256=1AQzkQ8ci9-rMZEM7wap8I09oPP4AZd93RfXMQRgVro,36386
 sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
@@ -8,7 +8,7 @@ sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
 sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
 sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
 sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
-sglang/version.py,sha256=
+sglang/version.py,sha256=7Z8nSxbc04sgIKYqfKxkmSnG2nnSPT9dpM3RYiFOpUc,28
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -23,12 +23,12 @@ sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThc
 sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
 sglang/lang/backend/runtime_endpoint.py,sha256=iVb7SlrpJ1ic92QG5kQUphZUb2EaVWY43dkmAO5pju4,10514
 sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
-sglang/srt/conversation.py,sha256=
+sglang/srt/conversation.py,sha256=B4QPGOUVdoXRJwWWxSm5pfifGpuBs07fDTxJ1BHUXLw,20003
 sglang/srt/hf_transformers_utils.py,sha256=rt6flb6BoYTO8fw7AKCXmQLJx5XuSUuRmZX-VJHmuLQ,6064
 sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
-sglang/srt/server.py,sha256=
-sglang/srt/server_args.py,sha256=
-sglang/srt/utils.py,sha256=
+sglang/srt/server.py,sha256=hb27kEsOzQeVy5HuMVRJNOG8OsFwq0KZBVsZXB2qN1U,23267
+sglang/srt/server_args.py,sha256=_Y7YLlGYOWpR1mtfN8Hmi6hsldkp4q8vLhcNatHhsuQ,24374
+sglang/srt/utils.py,sha256=PYApZ2rFU67TnJaKkkjF9Z93jBPCJkotB0kk1vHGI6Y,22858
 sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
 sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
 sglang/srt/configs/model_config.py,sha256=36My-o44trhWY3KYDeSFMGvv9XuUtIVI5e7F8VlOTWo,6723
@@ -60,13 +60,14 @@ sglang/srt/layers/quantization/base_config.py,sha256=vlpSPvSrFmUe65ETg4SoPocQ9bV
 sglang/srt/lora/lora.py,sha256=a5j_Yy0s95msVPFgOuH5PCe7sMu0AyZFQ5wL0H-YIg8,14913
 sglang/srt/lora/lora_config.py,sha256=paVB7F7SIuxr_vodvKf8zzAlH2fdVYHhXxcXV62D0Vo,1411
 sglang/srt/lora/lora_manager.py,sha256=gzBwYXZEPYj56PkGTshTbWRfl_370wb6uTcRhDaLiF8,12801
+sglang/srt/managers/data_parallel_controller.py,sha256=GJGfX1-5DoQFZ-EMh_p02nvrOtrOc0UebnULWHhFrss,5765
 sglang/srt/managers/detokenizer_manager.py,sha256=iCLPdHkL6lAp_-Qew1u4Tyt3jYRkJ8i-Bj3l8TC-uaA,7278
 sglang/srt/managers/image_processor.py,sha256=9Y9RqyLdbt4uOK7pnJCJIhY77791klskSrEg8U6pyS4,6910
-sglang/srt/managers/io_struct.py,sha256=
-sglang/srt/managers/schedule_batch.py,sha256=
+sglang/srt/managers/io_struct.py,sha256=PxeLOgRJR5raUXExmQHWAnvJZLU0BA_e591zthEOYAQ,12185
+sglang/srt/managers/schedule_batch.py,sha256=rev0x6tp2ex8uW4PPrcfJ6m6WgfhsNLpuPNWForYcGE,31363
 sglang/srt/managers/schedule_policy.py,sha256=PiTKvsAFwoNWNsv_SFkghIHCL452MdboRc2cmN6ITcU,11935
-sglang/srt/managers/scheduler.py,sha256=
-sglang/srt/managers/tokenizer_manager.py,sha256=
+sglang/srt/managers/scheduler.py,sha256=3eJjVZgLDyZWjniQf1Mkv1-1rbQyaOjyM0cRG-bNHAw,40625
+sglang/srt/managers/tokenizer_manager.py,sha256=AI1yfV5A1cpWDDuoelRx55lKMgNWccNOvyDFlKLddFA,24794
 sglang/srt/managers/tp_worker.py,sha256=fcaW-u7AAX49kQCNn_AEtdRPykRdT6Z6lx1O9LHA15E,4833
 sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
 sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
@@ -75,7 +76,7 @@ sglang/srt/mem_cache/memory_pool.py,sha256=L-5drUt7vlyvple4OcjH1jJRzt2qhVrpc9klZ
 sglang/srt/mem_cache/radix_cache.py,sha256=00bghOihUm7lA1i4gxxMYQLept9LaHg2ZSXZryuFZZI,10121
 sglang/srt/model_executor/cuda_graph_runner.py,sha256=iheZYErwFT_W4kJUE1dgbGoQQx7hyOSKa-Yv8guq0DI,10479
 sglang/srt/model_executor/forward_batch_info.py,sha256=FIQ8XIIP724mIL2l7w7mSEFH452qw-TPpqm43J4YeHM,5822
-sglang/srt/model_executor/model_runner.py,sha256=
+sglang/srt/model_executor/model_runner.py,sha256=Qxp6VyL-yiDzaTQuAQPTRrHiqsZAT0ki94teubxbocc,23237
 sglang/srt/models/baichuan.py,sha256=50m43kIVo-YamHFwxyiLGG_pCbF7mzUJfhEyuuSmVC8,15100
 sglang/srt/models/chatglm.py,sha256=XaS_6-ZvRw7X-56sk9xQogqT0NzGEMVpiAdQnC5qbBY,13333
 sglang/srt/models/commandr.py,sha256=2urK7u2FiwPBl60hMmt-wfaJ8V-ilv6l1B37MUlvSxk,14121
@@ -108,9 +109,9 @@ sglang/srt/models/torch_native_llama.py,sha256=c5GJ_k9zbSOk0PjLCXAK8YebGEy0RUVYZ
 sglang/srt/models/xverse.py,sha256=i11wEKqqVCoVtH7yo9jfpNyGHxhw7NvTPid3ojmg79s,13634
 sglang/srt/models/xverse_moe.py,sha256=JwkBhsyusP7e_hAMnomkP8cEmKNCLJPRtwaTERQ0D0M,15818
 sglang/srt/models/yivl.py,sha256=N3noJ5M-FiZS-E_zfaJs4prQOu_ineRt11MWloYgOR8,4826
-sglang/srt/openai_api/adapter.py,sha256=
+sglang/srt/openai_api/adapter.py,sha256=bQ2lZGEQGAUkITXshdnCPzx6JN9iqYVvIpfD7uO5rN4,51519
 sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
-sglang/srt/sampling/sampling_batch_info.py,sha256=
+sglang/srt/sampling/sampling_batch_info.py,sha256=ec5TMw47q2OCrkp2QwN45Ss1RZ-QYv7-KuGFKyGuvsg,6686
 sglang/srt/sampling/sampling_params.py,sha256=Xwh4_M6PP4SWyGV-zNyIhp4XbRKbeU4251ao8UOlZlI,5704
 sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
 sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
@@ -132,8 +133,8 @@ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c
 sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
 sglang/test/test_utils.py,sha256=NkJuezjmonjgC3_i_CTBd8KSqWh6W9CLcgoaqvTNK2U,18684
 sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
-sglang-0.3.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sglang-0.3.3.dist-info/METADATA,sha256=
-sglang-0.3.3.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-sglang-0.3.3.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
-sglang-0.3.3.dist-info/RECORD,,
+sglang-0.3.3.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.3.3.post1.dist-info/METADATA,sha256=xfzfAtRkt_PcB8Lw34-Jckq-iukmhDnhu-_8e9SZ3_Y,39186
+sglang-0.3.3.post1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+sglang-0.3.3.post1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.3.3.post1.dist-info/RECORD,,
{sglang-0.3.3.dist-info → sglang-0.3.3.post1.dist-info}/LICENSE
File without changes
{sglang-0.3.3.dist-info → sglang-0.3.3.post1.dist-info}/WHEEL
File without changes
{sglang-0.3.3.dist-info → sglang-0.3.3.post1.dist-info}/top_level.txt
File without changes