sglang 0.3.3__py3-none-any.whl → 0.3.3.post1__py3-none-any.whl

This diff shows the differences between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
sglang/bench_latency.py CHANGED
@@ -139,7 +139,7 @@ def load_model(server_args, port_args, tp_rank):
139
139
  gpu_id=tp_rank,
140
140
  tp_rank=tp_rank,
141
141
  tp_size=server_args.tp_size,
142
- nccl_port=port_args.nccl_ports[0],
142
+ nccl_port=port_args.nccl_port,
143
143
  server_args=server_args,
144
144
  )
145
145
  rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
@@ -220,6 +220,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
220
220
  return reqs
221
221
 
222
222
 
223
+ @torch.inference_mode()
223
224
  def extend(reqs, model_runner):
224
225
  batch = ScheduleBatch.init_new(
225
226
  reqs=reqs,
@@ -235,6 +236,7 @@ def extend(reqs, model_runner):
235
236
  return next_token_ids, logits_output.next_token_logits, batch
236
237
 
237
238
 
239
+ @torch.inference_mode()
238
240
  def decode(input_token_ids, batch, model_runner):
239
241
  batch.prepare_for_decode(input_token_ids)
240
242
  model_worker_batch = batch.get_model_worker_batch()
@@ -244,7 +246,6 @@ def decode(input_token_ids, batch, model_runner):
244
246
  return next_token_ids, logits_output.next_token_logits
245
247
 
246
248
 
247
- @torch.inference_mode()
248
249
  def correctness_test(
249
250
  server_args,
250
251
  port_args,
@@ -287,7 +288,6 @@ def correctness_test(
287
288
  rank_print(tokenizer.decode(output_ids[i]), "\n")
288
289
 
289
290
 
290
- @torch.inference_mode()
291
291
  def latency_test_run_once(
292
292
  run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
293
293
  ):
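Note: this release moves the @torch.inference_mode() decorator off the correctness/latency test drivers and onto the per-step extend() and decode() helpers, so only the forward passes run with autograd disabled. A minimal sketch of the pattern (not sglang's actual code; the toy model and driver are placeholders):

```
import torch

model = torch.nn.Linear(16, 16)

@torch.inference_mode()  # autograd is disabled only inside this call
def decode_step(x):
    return model(x)

def driver():
    x = torch.randn(2, 16)
    y = decode_step(x)       # the driver itself runs in normal mode
    return y.argmax(dim=-1)

print(driver())
```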
sglang/srt/conversation.py CHANGED
@@ -70,6 +70,9 @@ class Conversation:
70
70
  sep2: str = None
71
71
  # Stop criteria (the default one is EOS token)
72
72
  stop_str: Union[str, List[str]] = None
73
+ # The string that represents an image token in the prompt
74
+ image_token: str = "<image>"
75
+
73
76
  image_data: Optional[List[str]] = None
74
77
  modalities: Optional[List[str]] = None
75
78
 
@@ -334,6 +337,7 @@ class Conversation:
334
337
  sep=self.sep,
335
338
  sep2=self.sep2,
336
339
  stop_str=self.stop_str,
340
+ image_token=self.image_token,
337
341
  )
338
342
 
339
343
  def dict(self):
@@ -381,6 +385,7 @@ def generate_chat_conv(
381
385
  stop_str=conv.stop_str,
382
386
  image_data=[],
383
387
  modalities=[],
388
+ image_token=conv.image_token,
384
389
  )
385
390
 
386
391
  if isinstance(request.messages, str):
@@ -412,9 +417,13 @@ def generate_chat_conv(
412
417
  num_image_url += 1
413
418
  conv.modalities.append(content.modalities)
414
419
  if num_image_url > 1:
415
- image_token = "<image>"
420
+ image_token = conv.image_token
416
421
  else:
417
- image_token = "<image>\n"
422
+ image_token = (
423
+ conv.image_token + "\n"
424
+ if conv.name != "qwen2-vl"
425
+ else conv.image_token
426
+ )
418
427
  for content in message.content:
419
428
  if content.type == "text":
420
429
  if num_image_url > 16:
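The Conversation dataclass now carries its own image placeholder (image_token, defaulting to "<image>"), and generate_chat_conv uses it instead of a hard-coded string; with a single image, every template except qwen2-vl still appends a newline. A hypothetical helper distilling that branch (the conv attributes and values below are illustrative only):

```
from types import SimpleNamespace

def pick_image_token(conv, num_image_url: int) -> str:
    if num_image_url > 1:
        return conv.image_token
    # Single image: most templates append a newline; qwen2-vl keeps the bare token.
    return conv.image_token + "\n" if conv.name != "qwen2-vl" else conv.image_token

conv = SimpleNamespace(name="qwen2-vl", image_token="<image>")  # illustrative values
print(pick_image_token(conv, 1))
```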
sglang/srt/managers/data_parallel_controller.py ADDED
@@ -0,0 +1,177 @@
1
+ """
2
+ Copyright 2023-2024 SGLang Team
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+ """
15
+
16
+ """A controller that dispatches requests to multiple data parallel workers."""
17
+
18
+ import logging
19
+ import multiprocessing as mp
20
+ from enum import Enum, auto
21
+
22
+ import zmq
23
+
24
+ from sglang.srt.managers.io_struct import (
25
+ TokenizedEmbeddingReqInput,
26
+ TokenizedGenerateReqInput,
27
+ TokenizedRewardReqInput,
28
+ )
29
+ from sglang.srt.managers.scheduler import run_scheduler_process
30
+ from sglang.srt.server_args import PortArgs, ServerArgs
31
+ from sglang.srt.utils import (
32
+ configure_logger,
33
+ kill_parent_process,
34
+ suppress_other_loggers,
35
+ )
36
+ from sglang.utils import get_exception_traceback
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
+ class LoadBalanceMethod(Enum):
42
+ """Load balance method."""
43
+
44
+ ROUND_ROBIN = auto()
45
+ SHORTEST_QUEUE = auto()
46
+
47
+ @classmethod
48
+ def from_str(cls, method: str):
49
+ method = method.upper()
50
+ try:
51
+ return cls[method]
52
+ except KeyError as exc:
53
+ raise ValueError(f"Invalid load balance method: {method}") from exc
54
+
55
+
56
+ class DataParallelController:
57
+ """A controller that dispatches requests to multiple data parallel workers."""
58
+
59
+ def __init__(self, server_args, port_args) -> None:
60
+ # Parse args
61
+ self.server_args = server_args
62
+ self.port_args = port_args
63
+ self.load_balance_method = LoadBalanceMethod.from_str(
64
+ server_args.load_balance_method
65
+ )
66
+
67
+ # Init inter-process communication
68
+ self.context = zmq.Context(1 + server_args.dp_size)
69
+ self.recv_from_tokenizer = self.context.socket(zmq.PULL)
70
+ self.recv_from_tokenizer.bind(f"ipc://{port_args.scheduler_input_ipc_name}")
71
+
72
+ # Dispatch method
73
+ self.round_robin_counter = 0
74
+ dispatch_lookup = {
75
+ LoadBalanceMethod.ROUND_ROBIN: self.round_robin_scheduler,
76
+ LoadBalanceMethod.SHORTEST_QUEUE: self.shortest_queue_scheduler,
77
+ }
78
+ self.dispatching = dispatch_lookup[self.load_balance_method]
79
+
80
+ # Start data parallel workers
81
+ base_gpu_id = 0
82
+ self.workers = []
83
+ for dp_rank in range(server_args.dp_size):
84
+ tmp_port_args = PortArgs.init_new(server_args)
85
+ tmp_port_args.detokenizer_ipc_name = port_args.detokenizer_ipc_name
86
+
87
+ send_to = self.launch_tensor_parallel_group(
88
+ server_args,
89
+ tmp_port_args,
90
+ base_gpu_id,
91
+ dp_rank,
92
+ )
93
+
94
+ self.workers.append(send_to)
95
+ base_gpu_id += server_args.tp_size
96
+
97
+ def launch_tensor_parallel_group(
98
+ self,
99
+ server_args: ServerArgs,
100
+ port_args: PortArgs,
101
+ base_gpu_id: int,
102
+ dp_rank: int,
103
+ ):
104
+ # Launch tensor parallel scheduler processes
105
+ scheduler_procs = []
106
+ scheduler_pipe_readers = []
107
+ tp_size_per_node = server_args.tp_size // server_args.nnodes
108
+ tp_rank_range = range(
109
+ tp_size_per_node * server_args.node_rank,
110
+ tp_size_per_node * (server_args.node_rank + 1),
111
+ )
112
+ for tp_rank in tp_rank_range:
113
+ reader, writer = mp.Pipe(duplex=False)
114
+ gpu_id = base_gpu_id + tp_rank % tp_size_per_node
115
+ proc = mp.Process(
116
+ target=run_scheduler_process,
117
+ args=(server_args, port_args, gpu_id, tp_rank, dp_rank, writer),
118
+ )
119
+ proc.start()
120
+ scheduler_procs.append(proc)
121
+ scheduler_pipe_readers.append(reader)
122
+
123
+ send_to = self.context.socket(zmq.PUSH)
124
+ send_to.connect(f"ipc://{port_args.scheduler_input_ipc_name}")
125
+
126
+ # Wait for model to finish loading
127
+ for i in range(len(scheduler_pipe_readers)):
128
+ scheduler_pipe_readers[i].recv()
129
+
130
+ return send_to
131
+
132
+ def round_robin_scheduler(self, req):
133
+ self.workers[self.round_robin_counter].send_pyobj(req)
134
+ self.round_robin_counter = (self.round_robin_counter + 1) % len(self.workers)
135
+
136
+ def shortest_queue_scheduler(self, input_requests):
137
+ raise NotImplementedError()
138
+
139
+ def event_loop(self):
140
+ while True:
141
+ while True:
142
+ try:
143
+ recv_req = self.recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK)
144
+ except zmq.ZMQError:
145
+ break
146
+
147
+ if isinstance(
148
+ recv_req,
149
+ (
150
+ TokenizedGenerateReqInput,
151
+ TokenizedEmbeddingReqInput,
152
+ TokenizedRewardReqInput,
153
+ ),
154
+ ):
155
+ self.dispatching(recv_req)
156
+ else:
157
+ # Send other control messages to all workers
158
+ for worker in self.workers:
159
+ worker.queue.put(recv_req)
160
+
161
+
162
+ def run_data_parallel_controller_process(
163
+ server_args: ServerArgs,
164
+ port_args: PortArgs,
165
+ pipe_writer,
166
+ ):
167
+ configure_logger(server_args)
168
+ suppress_other_loggers()
169
+
170
+ try:
171
+ controller = DataParallelController(server_args, port_args)
172
+ pipe_writer.send("ready")
173
+ controller.event_loop()
174
+ except Exception:
175
+ msg = get_exception_traceback()
176
+ logger.error(msg)
177
+ kill_parent_process()
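This new controller re-enables data parallelism (the dp_size assertion is removed in server_args.py below): it pulls tokenized requests from the tokenizer's ZMQ socket, forwards generate/embedding/reward requests to one scheduler group per data-parallel rank, and broadcasts other control messages to all workers. A minimal sketch of the round-robin dispatch, with plain lists standing in for the ZMQ PUSH sockets (not sglang's actual code):

```
class RoundRobin:
    def __init__(self, workers):
        self.workers = workers                 # one outbound queue per DP rank
        self.counter = 0

    def dispatch(self, req):
        self.workers[self.counter].append(req)
        self.counter = (self.counter + 1) % len(self.workers)

workers = [[], [], []]
rr = RoundRobin(workers)
for i in range(7):
    rr.dispatch(f"req-{i}")
print([len(w) for w in workers])  # [3, 2, 2]
```

In practice this path is taken when the server is launched with --dp-size greater than 1 on a single node; multi-node data parallelism is still rejected by the assertion in server_args.py.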
sglang/srt/managers/io_struct.py CHANGED
@@ -20,6 +20,7 @@ processes (TokenizerManager, DetokenizerManager, Controller).
20
20
 
21
21
  import uuid
22
22
  from dataclasses import dataclass
23
+ from enum import Enum
23
24
  from typing import Dict, List, Optional, Union
24
25
 
25
26
  from sglang.srt.managers.schedule_batch import BaseFinishReason
@@ -119,8 +120,7 @@ class GenerateReqInput:
119
120
  elif not isinstance(self.image_data, list):
120
121
  self.image_data = [self.image_data] * num
121
122
  elif isinstance(self.image_data, list):
122
- # FIXME incorrect order for duplication
123
- self.image_data = self.image_data * num
123
+ pass
124
124
 
125
125
  if self.sampling_params is None:
126
126
  self.sampling_params = [{}] * num
@@ -344,3 +344,8 @@ class UpdateWeightReqOutput:
344
344
  class AbortReq:
345
345
  # The request id
346
346
  rid: str
347
+
348
+
349
+ class ProfileReq(Enum):
350
+ START_PROFILE = 1
351
+ STOP_PROFILE = 2
sglang/srt/managers/schedule_batch.py CHANGED
@@ -423,6 +423,9 @@ class ScheduleBatch:
423
423
  # Stream
424
424
  has_stream: bool = False
425
425
 
426
+ # device
427
+ device: str = "cuda"
428
+
426
429
  # Has regex
427
430
  has_regex: bool = False
428
431
 
@@ -439,6 +442,7 @@ class ScheduleBatch:
439
442
  tree_cache=tree_cache,
440
443
  return_logprob=return_logprob,
441
444
  has_stream=has_stream,
445
+ device=req_to_token_pool.device,
442
446
  has_regex=has_regex,
443
447
  )
444
448
 
@@ -806,6 +810,8 @@ class ScheduleBatch:
806
810
  self.sampling_info.regex_fsm_states = [
807
811
  req.regex_fsm_state for req in self.reqs
808
812
  ]
813
+ else:
814
+ self.sampling_info.regex_fsms = None
809
815
 
810
816
  return ModelWorkerBatch(
811
817
  forward_mode=self.forward_mode,
sglang/srt/managers/scheduler.py CHANGED
@@ -37,6 +37,7 @@ from sglang.srt.managers.io_struct import (
37
37
  BatchEmbeddingOut,
38
38
  BatchTokenIDOut,
39
39
  FlushCacheReq,
40
+ ProfileReq,
40
41
  TokenizedEmbeddingReqInput,
41
42
  TokenizedGenerateReqInput,
42
43
  TokenizedRewardReqInput,
@@ -141,7 +142,7 @@ class Scheduler:
141
142
  gpu_id=gpu_id,
142
143
  tp_rank=tp_rank,
143
144
  server_args=server_args,
144
- nccl_port=port_args.nccl_ports[0],
145
+ nccl_port=port_args.nccl_port,
145
146
  )
146
147
  self.tp_cpu_group = self.tp_worker.model_runner.tp_group.cpu_group
147
148
 
@@ -229,6 +230,22 @@ class Scheduler:
229
230
  self.new_token_ratio_decay = global_config.new_token_ratio_decay
230
231
  self.batch_is_full = False
231
232
 
233
+ if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "":
234
+ self.profiler = None
235
+ else:
236
+ self.torch_profiler_trace_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR")
237
+ logger.info(
238
+ "Profiling enabled. Traces will be saved to: %s",
239
+ self.torch_profiler_trace_dir,
240
+ )
241
+ self.profiler = torch.profiler.profile(
242
+ activities=[
243
+ torch.profiler.ProfilerActivity.CPU,
244
+ torch.profiler.ProfilerActivity.CUDA,
245
+ ],
246
+ with_stack=True,
247
+ )
248
+
232
249
  @torch.inference_mode()
233
250
  def event_loop(self):
234
251
  while True:
@@ -271,6 +288,11 @@ class Scheduler:
271
288
  elif isinstance(recv_req, UpdateWeightReqInput):
272
289
  success, message = self.update_weights(recv_req)
273
290
  self.out_pyobjs.append(UpdateWeightReqOutput(success, message))
291
+ elif isinstance(recv_req, ProfileReq):
292
+ if recv_req == ProfileReq.START_PROFILE:
293
+ self.start_profile()
294
+ else:
295
+ self.stop_profile()
274
296
  else:
275
297
  raise ValueError(f"Invalid request: {recv_req}")
276
298
 
@@ -433,6 +455,9 @@ class Scheduler:
433
455
  result = self.run_batch(batch)
434
456
  self.process_batch_result(batch, result)
435
457
 
458
+ if self.running_batch.is_empty():
459
+ self.running_batch = None
460
+
436
461
  if self.running_batch is None:
437
462
  break
438
463
 
@@ -772,9 +797,6 @@ class Scheduler:
772
797
  if self.tp_rank == 0 and self.decode_forward_ct % 40 == 0:
773
798
  self.print_decode_stats()
774
799
 
775
- if self.running_batch.is_empty():
776
- self.running_batch = None
777
-
778
800
  def add_logprob_return_values(
779
801
  self,
780
802
  i: int,
@@ -1000,15 +1022,34 @@ class Scheduler:
1000
1022
  logger.error(message)
1001
1023
  return success, message
1002
1024
 
1025
+ def start_profile(self) -> None:
1026
+ if self.profiler is None:
1027
+ raise RuntimeError("Profiler is not enabled.")
1028
+ self.profiler.start()
1029
+
1030
+ def stop_profile(self) -> None:
1031
+ if self.profiler is None:
1032
+ raise RuntimeError("Profiler is not enabled.")
1033
+ self.profiler.stop()
1034
+ self.profiler.export_chrome_trace(
1035
+ self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
1036
+ )
1037
+ logger.info("Profiler is done")
1038
+
1003
1039
 
1004
1040
  def run_scheduler_process(
1005
1041
  server_args: ServerArgs,
1006
1042
  port_args: PortArgs,
1007
1043
  gpu_id: int,
1008
1044
  tp_rank: int,
1045
+ dp_rank: Optional[int],
1009
1046
  pipe_writer,
1010
1047
  ):
1011
- configure_logger(server_args, prefix=f" TP{tp_rank}")
1048
+ if dp_rank is None:
1049
+ configure_logger(server_args, prefix=f" TP{tp_rank}")
1050
+ else:
1051
+ configure_logger(server_args, prefix=f" DP{dp_rank} TP{tp_rank}")
1052
+
1012
1053
  suppress_other_loggers()
1013
1054
 
1014
1055
  try:
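The scheduler now builds a torch.profiler.profile instance at startup when the SGLANG_TORCH_PROFILER_DIR environment variable is set, and starts/stops it in response to ProfileReq messages, exporting a Chrome trace into that directory. A standalone sketch of the same wiring (the matmul is just a stand-in for the profiled region):

```
import os
import time
import torch

trace_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "")
profiler = None
if trace_dir:
    profiler = torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        with_stack=True,
    )

if profiler is not None:
    profiler.start()
    torch.randn(512, 512) @ torch.randn(512, 512)   # stand-in workload
    profiler.stop()
    profiler.export_chrome_trace(f"{trace_dir}/{time.time()}.trace.json.gz")
```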
sglang/srt/managers/tokenizer_manager.py CHANGED
@@ -46,6 +46,7 @@ from sglang.srt.managers.io_struct import (
46
46
  EmbeddingReqInput,
47
47
  FlushCacheReq,
48
48
  GenerateReqInput,
49
+ ProfileReq,
49
50
  RewardReqInput,
50
51
  TokenizedEmbeddingReqInput,
51
52
  TokenizedGenerateReqInput,
@@ -512,6 +513,14 @@ class TokenizerManager:
512
513
  req = AbortReq(rid)
513
514
  self.send_to_scheduler.send_pyobj(req)
514
515
 
516
+ def start_profile(self):
517
+ req = ProfileReq.START_PROFILE
518
+ self.send_to_scheduler.send_pyobj(req)
519
+
520
+ def stop_profile(self):
521
+ req = ProfileReq.STOP_PROFILE
522
+ self.send_to_scheduler.send_pyobj(req)
523
+
515
524
  async def update_weights(
516
525
  self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None
517
526
  ):
sglang/srt/model_executor/model_runner.py CHANGED
@@ -81,10 +81,11 @@ class ModelRunner:
81
81
  # Parse args
82
82
  self.model_config = model_config
83
83
  self.mem_fraction_static = mem_fraction_static
84
+ self.device = server_args.device
84
85
  self.gpu_id = gpu_id
85
86
  self.tp_rank = tp_rank
86
87
  self.tp_size = tp_size
87
- self.nccl_port = nccl_port
88
+ self.dist_port = nccl_port
88
89
  self.server_args = server_args
89
90
  self.is_multimodal_model = is_multimodal_model(
90
91
  self.model_config.hf_config.architectures
@@ -95,7 +96,7 @@ class ModelRunner:
95
96
  self.model_config.attention_arch == AttentionArch.MLA
96
97
  and not self.server_args.disable_mla
97
98
  ):
98
- logger.info("MLA optimization is tunred on. Use triton backend.")
99
+ logger.info("MLA optimization is turned on. Use triton backend.")
99
100
  self.server_args.attention_backend = "triton"
100
101
 
101
102
  if self.is_multimodal_model:
@@ -132,39 +133,45 @@ class ModelRunner:
132
133
  server_args.max_running_requests,
133
134
  server_args.max_total_tokens,
134
135
  )
135
- self.init_cublas()
136
- self.init_attention_backend()
137
- self.init_cuda_graphs()
136
+ if self.device == "cuda":
137
+ self.init_cublas()
138
+ self.init_attention_backend()
139
+ self.init_cuda_graphs()
140
+ else:
141
+ self.init_attention_backend()
138
142
 
139
143
  def init_torch_distributed(self):
144
+ logger.info("Init torch distributed begin.")
140
145
  # Init torch distributed
141
- torch.cuda.set_device(self.gpu_id)
142
- logger.info("Init nccl begin.")
146
+ if self.device == "cuda":
147
+ torch.cuda.set_device(self.gpu_id)
148
+ backend = "nccl"
143
149
 
144
150
  if not self.server_args.enable_p2p_check:
145
151
  monkey_patch_vllm_p2p_access_check(self.gpu_id)
146
-
147
152
  if self.server_args.dist_init_addr:
148
- nccl_init_method = f"tcp://{self.server_args.dist_init_addr}"
153
+ dist_init_method = f"tcp://{self.server_args.dist_init_addr}"
149
154
  else:
150
- nccl_init_method = f"tcp://127.0.0.1:{self.nccl_port}"
155
+ dist_init_method = f"tcp://127.0.0.1:{self.dist_port}"
151
156
  set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
152
157
  init_distributed_environment(
153
- backend="nccl",
158
+ backend=backend,
154
159
  world_size=self.tp_size,
155
160
  rank=self.tp_rank,
156
161
  local_rank=self.gpu_id,
157
- distributed_init_method=nccl_init_method,
162
+ distributed_init_method=dist_init_method,
158
163
  )
159
164
  initialize_model_parallel(tensor_model_parallel_size=self.tp_size)
160
165
  min_per_gpu_memory = get_available_gpu_memory(
161
- self.gpu_id, distributed=self.tp_size > 1
166
+ self.device, self.gpu_id, distributed=self.tp_size > 1
162
167
  )
163
168
  self.tp_group = get_tp_group()
164
169
 
165
170
  # Currently, there is a bug with mulit-node tensor parallelsim + padded cuda graph,
166
171
  # so we disable padding in cuda graph.
167
- if not all(in_the_same_node_as(self.tp_group.cpu_group, source_rank=0)):
172
+ if self.device == "cuda" and not all(
173
+ in_the_same_node_as(self.tp_group.cpu_group, source_rank=0)
174
+ ):
168
175
  self.server_args.disable_cuda_graph_padding = True
169
176
  logger.info(
170
177
  "Setting disable_cuda_graph_padding to True because of multi-node tensor parallelism."
@@ -172,7 +179,7 @@ class ModelRunner:
172
179
 
173
180
  # Check memory for tensor parallelism
174
181
  if self.tp_size > 1:
175
- local_gpu_memory = get_available_gpu_memory(self.gpu_id)
182
+ local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id)
176
183
  if min_per_gpu_memory < local_gpu_memory * 0.9:
177
184
  raise ValueError(
178
185
  "The memory capacity is unbalanced. Some GPUs may be occupied by other processes."
@@ -182,23 +189,22 @@ class ModelRunner:
182
189
 
183
190
  def load_model(self):
184
191
  logger.info(
185
- f"Load weight begin. avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
192
+ f"Load weight begin. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
186
193
  )
187
194
 
188
195
  # This can reduce thread conflicts and speed up weight loading.
189
196
  torch.set_num_threads(1)
190
-
191
- if torch.cuda.get_device_capability()[0] < 8:
192
- logger.info(
193
- "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
194
- )
195
- self.server_args.dtype = "float16"
196
- if torch.cuda.get_device_capability()[1] < 5:
197
- raise RuntimeError("SGLang only supports sm75 and above.")
197
+ if self.device == "cuda":
198
+ if torch.cuda.get_device_capability()[0] < 8:
199
+ logger.info(
200
+ "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
201
+ )
202
+ self.server_args.dtype = "float16"
203
+ if torch.cuda.get_device_capability()[1] < 5:
204
+ raise RuntimeError("SGLang only supports sm75 and above.")
198
205
 
199
206
  # Prepare the vllm model config
200
207
  monkey_patch_vllm_dummy_weight_loader()
201
- self.device_config = DeviceConfig()
202
208
  self.load_config = LoadConfig(load_format=self.server_args.load_format)
203
209
  self.vllm_model_config = VllmModelConfig(
204
210
  model=self.server_args.model_path,
@@ -220,7 +226,7 @@ class ModelRunner:
220
226
  self.model = get_model(
221
227
  model_config=self.vllm_model_config,
222
228
  load_config=self.load_config,
223
- device_config=self.device_config,
229
+ device_config=DeviceConfig(self.device),
224
230
  parallel_config=None,
225
231
  scheduler_config=None,
226
232
  lora_config=None,
@@ -240,7 +246,7 @@ class ModelRunner:
240
246
  f"Load weight end. "
241
247
  f"type={type(self.model).__name__}, "
242
248
  f"dtype={self.dtype}, "
243
- f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
249
+ f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
244
250
  )
245
251
 
246
252
  def update_weights(self, model_path: str, load_format: str):
@@ -254,10 +260,10 @@ class ModelRunner:
254
260
 
255
261
  logger.info(
256
262
  f"Update weights begin. "
257
- f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
263
+ f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
258
264
  )
259
265
 
260
- target_device = torch.device(self.device_config.device)
266
+ target_device = torch.device(self.device)
261
267
 
262
268
  try:
263
269
  # TODO: Use a better method to check this
@@ -343,7 +349,7 @@ class ModelRunner:
343
349
 
344
350
  def profile_max_num_token(self, total_gpu_memory: int):
345
351
  available_gpu_memory = get_available_gpu_memory(
346
- self.gpu_id, distributed=self.tp_size > 1
352
+ self.device, self.gpu_id, distributed=self.tp_size > 1
347
353
  )
348
354
  if (
349
355
  self.model_config.attention_arch == AttentionArch.MLA
@@ -409,11 +415,10 @@ class ModelRunner:
409
415
  4096,
410
416
  )
411
417
 
412
- device = "cuda"
413
418
  self.req_to_token_pool = ReqToTokenPool(
414
419
  size=max_num_reqs + 1,
415
420
  max_context_len=self.model_config.context_len + 4,
416
- device=device,
421
+ device=self.device,
417
422
  )
418
423
  if (
419
424
  self.model_config.attention_arch == AttentionArch.MLA
@@ -425,7 +430,7 @@ class ModelRunner:
425
430
  kv_lora_rank=self.model_config.kv_lora_rank,
426
431
  qk_rope_head_dim=self.model_config.qk_rope_head_dim,
427
432
  layer_num=self.model_config.num_hidden_layers,
428
- device=device,
433
+ device=self.device,
429
434
  )
430
435
  else:
431
436
  self.token_to_kv_pool = MHATokenToKVPool(
@@ -434,11 +439,11 @@ class ModelRunner:
434
439
  head_num=self.model_config.get_num_kv_heads(self.tp_size),
435
440
  head_dim=self.model_config.head_dim,
436
441
  layer_num=self.model_config.num_hidden_layers,
437
- device=device,
442
+ device=self.device,
438
443
  )
439
444
  logger.info(
440
445
  f"Memory pool end. "
441
- f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
446
+ f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
442
447
  )
443
448
 
444
449
  def init_cublas(self):
sglang/srt/openai_api/adapter.py CHANGED
@@ -117,7 +117,9 @@ def create_streaming_error_response(
117
117
  def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
118
118
  global chat_template_name
119
119
 
120
- logger.info(f"Use chat template: {chat_template_arg}")
120
+ logger.info(
121
+ f"Use chat template for the OpenAI-compatible API server: {chat_template_arg}"
122
+ )
121
123
  if not chat_template_exists(chat_template_arg):
122
124
  if not os.path.exists(chat_template_arg):
123
125
  raise RuntimeError(
@@ -924,7 +926,7 @@ def v1_chat_generate_request(
924
926
  else:
925
927
  prompt_kwargs = {"input_ids": input_ids}
926
928
  sampling_params_list = sampling_params_list[0]
927
- image_data = image_data_list[0]
929
+ image_data_list = image_data_list[0]
928
930
  return_logprobs = return_logprobs[0]
929
931
  logprob_start_lens = logprob_start_lens[0]
930
932
  top_logprobs_nums = top_logprobs_nums[0]
@@ -937,7 +939,7 @@ def v1_chat_generate_request(
937
939
 
938
940
  adapted_request = GenerateReqInput(
939
941
  **prompt_kwargs,
940
- image_data=image_data,
942
+ image_data=image_data_list,
941
943
  sampling_params=sampling_params_list,
942
944
  return_logprob=return_logprobs,
943
945
  logprob_start_len=logprob_start_lens,
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -37,6 +37,9 @@ class SamplingBatchInfo:
37
37
  linear_penalties: torch.Tensor = None
38
38
  scaling_penalties: torch.Tensor = None
39
39
 
40
+ # Device
41
+ device: str = "cuda"
42
+
40
43
  @classmethod
41
44
  def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
42
45
  reqs = batch.reqs
@@ -62,6 +65,7 @@ class SamplingBatchInfo:
62
65
  min_ps=min_ps,
63
66
  need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
64
67
  vocab_size=vocab_size,
68
+ device=batch.input_ids.device,
65
69
  )
66
70
  # TODO (lianmin): `need_min_p_sampling` needs to be updated in filter and merge.
67
71
 
@@ -75,7 +79,7 @@ class SamplingBatchInfo:
75
79
  ret.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
76
80
  vocab_size=vocab_size,
77
81
  batch=batch,
78
- device="cuda",
82
+ device=batch.input_ids.device,
79
83
  Penalizers={
80
84
  penaltylib.BatchedFrequencyPenalizer,
81
85
  penaltylib.BatchedMinNewTokensPenalizer,
@@ -107,7 +111,7 @@ class SamplingBatchInfo:
107
111
  self.linear_penalties = torch.zeros(
108
112
  (bs, self.vocab_size),
109
113
  dtype=torch.float32,
110
- device="cuda",
114
+ device=self.device,
111
115
  )
112
116
  self.linear_penalties = penalizer.apply(self.linear_penalties)
113
117
 
@@ -119,7 +123,10 @@ class SamplingBatchInfo:
119
123
 
120
124
  if has_regex:
121
125
  self.vocab_mask = torch.zeros(
122
- len(self.temperatures), self.vocab_size, dtype=torch.bool, device="cuda"
126
+ len(self.temperatures),
127
+ self.vocab_size,
128
+ dtype=torch.bool,
129
+ device=self.device,
123
130
  )
124
131
  for i, regex_fsm in enumerate(self.regex_fsms):
125
132
  if regex_fsm is not None:
@@ -144,7 +151,12 @@ class SamplingBatchInfo:
144
151
 
145
152
  @staticmethod
146
153
  def merge_bias_tensor(
147
- lhs: torch.Tensor, rhs: torch.Tensor, bs1: int, bs2: int, default: int = 0
154
+ lhs: torch.Tensor,
155
+ rhs: torch.Tensor,
156
+ bs1: int,
157
+ bs2: int,
158
+ device: str,
159
+ default: int = 0,
148
160
  ):
149
161
  # bias tensor can be None
150
162
  if lhs is not None or rhs is not None:
@@ -155,9 +167,9 @@ class SamplingBatchInfo:
155
167
  shape, dtype = rhs.shape[1:], rhs.dtype
156
168
  with torch.dtype(dtype):
157
169
  if lhs is None:
158
- lhs = torch.empty((bs1, *shape), device="cuda").fill_(default)
170
+ lhs = torch.empty((bs1, *shape), device=device).fill_(default)
159
171
  if rhs is None:
160
- rhs = torch.empty((bs2, *shape), device="cuda").fill_(default)
172
+ rhs = torch.empty((bs2, *shape), device=device).fill_(default)
161
173
  return torch.cat([lhs, rhs])
162
174
 
163
175
  return None
@@ -176,5 +188,5 @@ class SamplingBatchInfo:
176
188
  setattr(self, item, torch.concat([self_val, other_val]))
177
189
 
178
190
  self.logit_bias = SamplingBatchInfo.merge_bias_tensor(
179
- self.logit_bias, other.logit_bias, len(self), len(other)
191
+ self.logit_bias, other.logit_bias, len(self), len(other), self.device
180
192
  )
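SamplingBatchInfo now records the batch's device and derives it from batch.input_ids.device rather than hard-coding "cuda", so penalty and mask tensors are allocated wherever the batch already lives. The same pattern in isolation (a minimal example, not sglang's code):

```
import torch

def make_vocab_mask(input_ids: torch.Tensor, vocab_size: int) -> torch.Tensor:
    device = input_ids.device          # follow the batch: cuda:0, xpu:0, cpu, ...
    bs = input_ids.shape[0]
    return torch.zeros(bs, vocab_size, dtype=torch.bool, device=device)

mask = make_vocab_mask(torch.tensor([[1, 2, 3]]), vocab_size=8)
print(mask.shape, mask.device)         # torch.Size([1, 8]) cpu
```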
sglang/srt/server.py CHANGED
@@ -44,6 +44,9 @@ from fastapi.responses import JSONResponse, Response, StreamingResponse
44
44
 
45
45
  from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
46
46
  from sglang.srt.hf_transformers_utils import get_tokenizer
47
+ from sglang.srt.managers.data_parallel_controller import (
48
+ run_data_parallel_controller_process,
49
+ )
47
50
  from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
48
51
  from sglang.srt.managers.io_struct import (
49
52
  EmbeddingReqInput,
@@ -145,6 +148,28 @@ async def flush_cache():
145
148
  )
146
149
 
147
150
 
151
+ @app.get("/start_profile")
152
+ @app.post("/start_profile")
153
+ async def start_profile():
154
+ """Start profiling."""
155
+ tokenizer_manager.start_profile()
156
+ return Response(
157
+ content="Start profiling.\n",
158
+ status_code=200,
159
+ )
160
+
161
+
162
+ @app.get("/stop_profile")
163
+ @app.post("/stop_profile")
164
+ async def stop_profile():
165
+ """Stop profiling."""
166
+ tokenizer_manager.stop_profile()
167
+ return Response(
168
+ content="Stop profiling. This will take some time.\n",
169
+ status_code=200,
170
+ )
171
+
172
+
148
173
  @app.post("/update_weights")
149
174
  async def update_weights(obj: UpdateWeightReqInput, request: Request):
150
175
  """Update the weights inplace without re-launching the server."""
@@ -315,30 +340,40 @@ def launch_engine(
315
340
  server_args.model_path, server_args.tokenizer_path
316
341
  )
317
342
 
318
- # Launch tensor parallel scheduler processes
319
- scheduler_procs = []
320
- scheduler_pipe_readers = []
321
- tp_size_per_node = server_args.tp_size // server_args.nnodes
322
- tp_rank_range = range(
323
- tp_size_per_node * server_args.node_rank,
324
- tp_size_per_node * (server_args.node_rank + 1),
325
- )
326
- for tp_rank in tp_rank_range:
343
+ if server_args.dp_size == 1:
344
+ # Launch tensor parallel scheduler processes
345
+ scheduler_procs = []
346
+ scheduler_pipe_readers = []
347
+ tp_size_per_node = server_args.tp_size // server_args.nnodes
348
+ tp_rank_range = range(
349
+ tp_size_per_node * server_args.node_rank,
350
+ tp_size_per_node * (server_args.node_rank + 1),
351
+ )
352
+ for tp_rank in tp_rank_range:
353
+ reader, writer = mp.Pipe(duplex=False)
354
+ gpu_id = tp_rank % tp_size_per_node
355
+ proc = mp.Process(
356
+ target=run_scheduler_process,
357
+ args=(server_args, port_args, gpu_id, tp_rank, None, writer),
358
+ )
359
+ proc.start()
360
+ scheduler_procs.append(proc)
361
+ scheduler_pipe_readers.append(reader)
362
+
363
+ if server_args.node_rank >= 1:
364
+ # For other nodes, they do not need to run tokenizer or detokenizer,
365
+ # so they can just wait here.
366
+ while True:
367
+ pass
368
+ else:
369
+ # Launch the data parallel controller
327
370
  reader, writer = mp.Pipe(duplex=False)
328
- gpu_id = tp_rank % tp_size_per_node
371
+ scheduler_pipe_readers = [reader]
329
372
  proc = mp.Process(
330
- target=run_scheduler_process,
331
- args=(server_args, port_args, gpu_id, tp_rank, writer),
373
+ target=run_data_parallel_controller_process,
374
+ args=(server_args, port_args, writer),
332
375
  )
333
376
  proc.start()
334
- scheduler_procs.append(proc)
335
- scheduler_pipe_readers.append(reader)
336
-
337
- if server_args.node_rank >= 1:
338
- # For other nodes, they do not need to run tokenizer or detokenizer,
339
- # so they can just wait here.
340
- while True:
341
- pass
342
377
 
343
378
  # Launch detokenizer process
344
379
  detoken_proc = mp.Process(
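The server also gains /start_profile and /stop_profile endpoints (GET or POST) that forward ProfileReq messages to the scheduler; they only succeed if the server was launched with SGLANG_TORCH_PROFILER_DIR set, otherwise the scheduler raises "Profiler is not enabled." A hedged usage example (127.0.0.1:30000 is sglang's default address; adjust to your launch arguments):

```
import requests

base = "http://127.0.0.1:30000"
requests.post(f"{base}/start_profile")   # scheduler calls profiler.start()
# ... send some /generate traffic here so it is captured in the trace ...
requests.post(f"{base}/stop_profile")    # trace written under SGLANG_TORCH_PROFILER_DIR
```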
sglang/srt/server_args.py CHANGED
@@ -36,6 +36,7 @@ class ServerArgs:
36
36
  skip_tokenizer_init: bool = False
37
37
  load_format: str = "auto"
38
38
  dtype: str = "auto"
39
+ device: str = "cuda"
39
40
  kv_cache_dtype: str = "auto"
40
41
  trust_remote_code: bool = True
41
42
  context_length: Optional[int] = None
@@ -237,6 +238,13 @@ class ServerArgs:
237
238
  '* "float" is shorthand for FP32 precision.\n'
238
239
  '* "float32" for FP32 precision.',
239
240
  )
241
+ parser.add_argument(
242
+ "--device",
243
+ type=str,
244
+ default="cuda",
245
+ choices=["cuda"],
246
+ help="The device type.",
247
+ )
240
248
  parser.add_argument(
241
249
  "--kv-cache-dtype",
242
250
  type=str,
@@ -566,7 +574,7 @@ class ServerArgs:
566
574
  self.tp_size % self.nnodes == 0
567
575
  ), "tp_size must be divisible by number of nodes"
568
576
  assert not (
569
- self.dp_size > 1 and self.node_rank is not None
577
+ self.dp_size > 1 and self.nnodes != 1
570
578
  ), "multi-node data parallel is not supported"
571
579
  assert (
572
580
  self.max_loras_per_batch > 0
@@ -575,11 +583,6 @@ class ServerArgs:
575
583
  and (self.lora_paths is None or self.disable_radix_cache)
576
584
  ), "compatibility of lora and cuda graph and radix attention is in progress"
577
585
 
578
- assert self.dp_size == 1, (
579
- "The support for data parallelism is temporarily disabled during refactor. "
580
- "Please use sglang<=0.3.2 or wait for later updates."
581
- )
582
-
583
586
  if isinstance(self.lora_paths, list):
584
587
  lora_paths = self.lora_paths
585
588
  self.lora_paths = {}
@@ -618,11 +621,11 @@ class PortArgs:
618
621
  # The ipc filename for detokenizer to receive inputs from scheduler (zmq)
619
622
  detokenizer_ipc_name: str
620
623
 
621
- # The port for nccl initialization for multiple TP groups (torch.dist)
622
- nccl_ports: List[int]
624
+ # The port for nccl initialization (torch.dist)
625
+ nccl_port: int
623
626
 
624
- @classmethod
625
- def init_new(self, server_args):
627
+ @staticmethod
628
+ def init_new(server_args) -> "PortArgs":
626
629
  port = server_args.port + 1
627
630
  while True:
628
631
  if is_port_available(port):
@@ -633,7 +636,7 @@ class PortArgs:
633
636
  tokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
634
637
  scheduler_input_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
635
638
  detokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
636
- nccl_ports=[port],
639
+ nccl_port=port,
637
640
  )
638
641
 
639
642
 
sglang/srt/utils.py CHANGED
@@ -140,26 +140,41 @@ def calculate_time(show=False, min_cost_ms=0.0):
140
140
  return wrapper
141
141
 
142
142
 
143
- def get_available_gpu_memory(gpu_id, distributed=False):
143
+ def get_available_gpu_memory(device, gpu_id, distributed=False):
144
144
  """
145
145
  Get available memory for cuda:gpu_id device.
146
146
  When distributed is True, the available memory is the minimum available memory of all GPUs.
147
147
  """
148
- num_gpus = torch.cuda.device_count()
149
- assert gpu_id < num_gpus
148
+ if device == "cuda":
149
+ num_gpus = torch.cuda.device_count()
150
+ assert gpu_id < num_gpus
151
+
152
+ if torch.cuda.current_device() != gpu_id:
153
+ print(
154
+ f"WARNING: current device is not {gpu_id}, but {torch.cuda.current_device()}, ",
155
+ "which may cause useless memory allocation for torch CUDA context.",
156
+ )
150
157
 
151
- if torch.cuda.current_device() != gpu_id:
152
- print(
153
- f"WARNING: current device is not {gpu_id}, but {torch.cuda.current_device()}, ",
154
- "which may cause useless memory allocation for torch CUDA context.",
155
- )
158
+ torch.cuda.empty_cache()
159
+ free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
156
160
 
157
- torch.cuda.empty_cache()
158
- free_gpu_memory, _ = torch.cuda.mem_get_info(gpu_id)
161
+ elif device == "xpu":
162
+ num_gpus = torch.xpu.device_count()
163
+ assert gpu_id < num_gpus
164
+
165
+ if torch.xpu.current_device() != gpu_id:
166
+ print(
167
+ f"WARNING: current device is not {gpu_id}, but {torch.xpu.current_device()}, ",
168
+ "which may cause useless memory allocation for torch XPU context.",
169
+ )
170
+ torch.xpu.empty_cache()
171
+ used_memory = torch.xpu.memory_allocated()
172
+ total_gpu_memory = torch.xpu.get_device_properties(gpu_id).total_memory
173
+ free_gpu_memory = total_gpu_memory - used_memory
159
174
 
160
175
  if distributed:
161
176
  tensor = torch.tensor(free_gpu_memory, dtype=torch.float32).to(
162
- torch.device("cuda", gpu_id)
177
+ torch.device(device, gpu_id)
163
178
  )
164
179
  torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.MIN)
165
180
  free_gpu_memory = tensor.item()
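get_available_gpu_memory now takes the device type as its first argument and adds an XPU branch alongside CUDA, which is why every call site above passes self.device first. A hedged usage example (the result is formatted as GB by the call sites above; the distributed form requires torch.distributed to be initialized):

```
from sglang.srt.utils import get_available_gpu_memory

free_gb = get_available_gpu_memory("cuda", 0)
print(f"avail mem={free_gb:.2f} GB")

# Minimum across a tensor-parallel group:
# min_gb = get_available_gpu_memory("cuda", gpu_id, distributed=True)
```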
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.3.3"
1
+ __version__ = "0.3.3.post1"
sglang-0.3.3.dist-info/METADATA → sglang-0.3.3.post1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.3.3
3
+ Version: 0.3.3.post1
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -257,8 +257,8 @@ Requires-Dist: sentence-transformers; extra == "test"
257
257
  Requires-Dist: accelerate; extra == "test"
258
258
  Requires-Dist: peft; extra == "test"
259
259
 
260
- <div align="center">
261
- <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
260
+ <div align="center" id="sglangtop">
261
+ <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
262
262
 
263
263
  [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
264
264
  ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
@@ -270,10 +270,9 @@ Requires-Dist: peft; extra == "test"
270
270
 
271
271
  --------------------------------------------------------------------------------
272
272
 
273
- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
273
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pptx) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
274
274
 
275
275
  ## Upcoming Events
276
- - [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
277
276
  - [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
278
277
 
279
278
  ## News
@@ -324,7 +323,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
324
323
  ### Method 2: From source
325
324
  ```
326
325
  # Use the last release branch
327
- git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
326
+ git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
328
327
  cd sglang
329
328
 
330
329
  pip install --upgrade pip
@@ -848,3 +847,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
848
847
  ## Citation And Acknowledgment
849
848
  Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
850
849
  We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
850
+
851
+
852
+
853
+ <p align="center">
854
+ <a href="#sglangtop" target="_blank">
855
+ <bold>Back To Top </bold>
856
+ </a>
857
+ </p>
sglang-0.3.3.dist-info/RECORD → sglang-0.3.3.post1.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
1
1
  sglang/__init__.py,sha256=b_pqO9bR2fjK9En_tigfzKTiQzE8b_hUizY0DAKVk1M,1616
2
2
  sglang/api.py,sha256=5x591S4rLbmNPs75qPwGKVu1sonVGDyjPAJlHTyWw50,6956
3
- sglang/bench_latency.py,sha256=NkaL4YFWqDnochwaLd8o2pyZGqu6TeURbFB3TGyZHr4,17893
3
+ sglang/bench_latency.py,sha256=gCS_nPNCf3sYR83jg6_KNadm-Xy7-I1V-UdZaoKFi8M,17889
4
4
  sglang/bench_server_latency.py,sha256=rRSDqjJ5jan9AzppOGx75KRUjZCU2dUG2h06CQOdJgk,5377
5
5
  sglang/bench_serving.py,sha256=1AQzkQ8ci9-rMZEM7wap8I09oPP4AZd93RfXMQRgVro,36386
6
6
  sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
@@ -8,7 +8,7 @@ sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
8
8
  sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
9
9
  sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
10
10
  sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
11
- sglang/version.py,sha256=8KcCYTXH99C2-gCLuPILJvtT9YftRWJsartIx6TQ2ZY,22
11
+ sglang/version.py,sha256=7Z8nSxbc04sgIKYqfKxkmSnG2nnSPT9dpM3RYiFOpUc,28
12
12
  sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
14
14
  sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -23,12 +23,12 @@ sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThc
23
23
  sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
24
24
  sglang/lang/backend/runtime_endpoint.py,sha256=iVb7SlrpJ1ic92QG5kQUphZUb2EaVWY43dkmAO5pju4,10514
25
25
  sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
26
- sglang/srt/conversation.py,sha256=S5w5V6G1xigNxa3UQoSxRcMpQLWWDT9EPBoHBvHkSAk,19663
26
+ sglang/srt/conversation.py,sha256=B4QPGOUVdoXRJwWWxSm5pfifGpuBs07fDTxJ1BHUXLw,20003
27
27
  sglang/srt/hf_transformers_utils.py,sha256=rt6flb6BoYTO8fw7AKCXmQLJx5XuSUuRmZX-VJHmuLQ,6064
28
28
  sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
29
- sglang/srt/server.py,sha256=SKV6IxR8w0AmuwgHSEOfag_t-f6hAEq9Xg49iBioi2U,22224
30
- sglang/srt/server_args.py,sha256=LI8ehxs0sfI0EDhON-OhNGbDx0-oo9QhfnpYjYwnH54,24405
31
- sglang/srt/utils.py,sha256=amDWXIu1syU-kvdV8bUkNfYaMfpcN22BKZm_2xp59jI,22202
29
+ sglang/srt/server.py,sha256=hb27kEsOzQeVy5HuMVRJNOG8OsFwq0KZBVsZXB2qN1U,23267
30
+ sglang/srt/server_args.py,sha256=_Y7YLlGYOWpR1mtfN8Hmi6hsldkp4q8vLhcNatHhsuQ,24374
31
+ sglang/srt/utils.py,sha256=PYApZ2rFU67TnJaKkkjF9Z93jBPCJkotB0kk1vHGI6Y,22858
32
32
  sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
33
33
  sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
34
34
  sglang/srt/configs/model_config.py,sha256=36My-o44trhWY3KYDeSFMGvv9XuUtIVI5e7F8VlOTWo,6723
@@ -60,13 +60,14 @@ sglang/srt/layers/quantization/base_config.py,sha256=vlpSPvSrFmUe65ETg4SoPocQ9bV
60
60
  sglang/srt/lora/lora.py,sha256=a5j_Yy0s95msVPFgOuH5PCe7sMu0AyZFQ5wL0H-YIg8,14913
61
61
  sglang/srt/lora/lora_config.py,sha256=paVB7F7SIuxr_vodvKf8zzAlH2fdVYHhXxcXV62D0Vo,1411
62
62
  sglang/srt/lora/lora_manager.py,sha256=gzBwYXZEPYj56PkGTshTbWRfl_370wb6uTcRhDaLiF8,12801
63
+ sglang/srt/managers/data_parallel_controller.py,sha256=GJGfX1-5DoQFZ-EMh_p02nvrOtrOc0UebnULWHhFrss,5765
63
64
  sglang/srt/managers/detokenizer_manager.py,sha256=iCLPdHkL6lAp_-Qew1u4Tyt3jYRkJ8i-Bj3l8TC-uaA,7278
64
65
  sglang/srt/managers/image_processor.py,sha256=9Y9RqyLdbt4uOK7pnJCJIhY77791klskSrEg8U6pyS4,6910
65
- sglang/srt/managers/io_struct.py,sha256=rPyQk5y-jJu4eyoqUVh4M8B14PifjkE8B3K5yI0NX24,12185
66
- sglang/srt/managers/schedule_batch.py,sha256=mqdMg1QB6PNLbBjxkXoP_Ld82R1w34g_13YH82DGMh8,31216
66
+ sglang/srt/managers/io_struct.py,sha256=PxeLOgRJR5raUXExmQHWAnvJZLU0BA_e591zthEOYAQ,12185
67
+ sglang/srt/managers/schedule_batch.py,sha256=rev0x6tp2ex8uW4PPrcfJ6m6WgfhsNLpuPNWForYcGE,31363
67
68
  sglang/srt/managers/schedule_policy.py,sha256=PiTKvsAFwoNWNsv_SFkghIHCL452MdboRc2cmN6ITcU,11935
68
- sglang/srt/managers/scheduler.py,sha256=N9GQnp2SXd8-uN49KmQO-144N27M6h3dxRZuFZ-9AmY,39132
69
- sglang/srt/managers/tokenizer_manager.py,sha256=BAvLW_cRtIgjL0_cwrvDAb7g740fgEddyqaT3JtofR4,24548
69
+ sglang/srt/managers/scheduler.py,sha256=3eJjVZgLDyZWjniQf1Mkv1-1rbQyaOjyM0cRG-bNHAw,40625
70
+ sglang/srt/managers/tokenizer_manager.py,sha256=AI1yfV5A1cpWDDuoelRx55lKMgNWccNOvyDFlKLddFA,24794
70
71
  sglang/srt/managers/tp_worker.py,sha256=fcaW-u7AAX49kQCNn_AEtdRPykRdT6Z6lx1O9LHA15E,4833
71
72
  sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
72
73
  sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
@@ -75,7 +76,7 @@ sglang/srt/mem_cache/memory_pool.py,sha256=L-5drUt7vlyvple4OcjH1jJRzt2qhVrpc9klZ
75
76
  sglang/srt/mem_cache/radix_cache.py,sha256=00bghOihUm7lA1i4gxxMYQLept9LaHg2ZSXZryuFZZI,10121
76
77
  sglang/srt/model_executor/cuda_graph_runner.py,sha256=iheZYErwFT_W4kJUE1dgbGoQQx7hyOSKa-Yv8guq0DI,10479
77
78
  sglang/srt/model_executor/forward_batch_info.py,sha256=FIQ8XIIP724mIL2l7w7mSEFH452qw-TPpqm43J4YeHM,5822
78
- sglang/srt/model_executor/model_runner.py,sha256=KyglHFIMb5TC-NszN2D85_k7oVQLhbwhUYa7u3RFkoc,22874
79
+ sglang/srt/model_executor/model_runner.py,sha256=Qxp6VyL-yiDzaTQuAQPTRrHiqsZAT0ki94teubxbocc,23237
79
80
  sglang/srt/models/baichuan.py,sha256=50m43kIVo-YamHFwxyiLGG_pCbF7mzUJfhEyuuSmVC8,15100
80
81
  sglang/srt/models/chatglm.py,sha256=XaS_6-ZvRw7X-56sk9xQogqT0NzGEMVpiAdQnC5qbBY,13333
81
82
  sglang/srt/models/commandr.py,sha256=2urK7u2FiwPBl60hMmt-wfaJ8V-ilv6l1B37MUlvSxk,14121
@@ -108,9 +109,9 @@ sglang/srt/models/torch_native_llama.py,sha256=c5GJ_k9zbSOk0PjLCXAK8YebGEy0RUVYZ
108
109
  sglang/srt/models/xverse.py,sha256=i11wEKqqVCoVtH7yo9jfpNyGHxhw7NvTPid3ojmg79s,13634
109
110
  sglang/srt/models/xverse_moe.py,sha256=JwkBhsyusP7e_hAMnomkP8cEmKNCLJPRtwaTERQ0D0M,15818
110
111
  sglang/srt/models/yivl.py,sha256=N3noJ5M-FiZS-E_zfaJs4prQOu_ineRt11MWloYgOR8,4826
111
- sglang/srt/openai_api/adapter.py,sha256=ULX1lo23r6semogKcbUOXGSgPJi8NJ7IuC0WVvEbVbs,51458
112
+ sglang/srt/openai_api/adapter.py,sha256=bQ2lZGEQGAUkITXshdnCPzx6JN9iqYVvIpfD7uO5rN4,51519
112
113
  sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
113
- sglang/srt/sampling/sampling_batch_info.py,sha256=mtE_kLC6U-X6Q20BVjPWyDOoGc4kcTdIPpcsNeZcRYo,6462
114
+ sglang/srt/sampling/sampling_batch_info.py,sha256=ec5TMw47q2OCrkp2QwN45Ss1RZ-QYv7-KuGFKyGuvsg,6686
114
115
  sglang/srt/sampling/sampling_params.py,sha256=Xwh4_M6PP4SWyGV-zNyIhp4XbRKbeU4251ao8UOlZlI,5704
115
116
  sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
116
117
  sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
@@ -132,8 +133,8 @@ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c
132
133
  sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
133
134
  sglang/test/test_utils.py,sha256=NkJuezjmonjgC3_i_CTBd8KSqWh6W9CLcgoaqvTNK2U,18684
134
135
  sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
135
- sglang-0.3.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
136
- sglang-0.3.3.dist-info/METADATA,sha256=zeY2pmiGPJb52zaHqiRHY4OcZqAHPvG_zPyve5KfANc,39063
137
- sglang-0.3.3.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
138
- sglang-0.3.3.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
139
- sglang-0.3.3.dist-info/RECORD,,
136
+ sglang-0.3.3.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
137
+ sglang-0.3.3.post1.dist-info/METADATA,sha256=xfzfAtRkt_PcB8Lw34-Jckq-iukmhDnhu-_8e9SZ3_Y,39186
138
+ sglang-0.3.3.post1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
139
+ sglang-0.3.3.post1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
140
+ sglang-0.3.3.post1.dist-info/RECORD,,