sglang 0.3.1.post1__py3-none-any.whl → 0.3.1.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sglang/bench_latency.py CHANGED
@@ -1,5 +1,7 @@
  """
- Benchmark the latency of a given model. It accepts arguments similar to those of launch_server.py.
+ Benchmark the latency of running a single static batch.
+ This script does not launch a server and uses the low-level APIs.
+ It accepts arguments similar to those of launch_server.py.

  # Usage (latency test)
  ## with dummy weights:
sglang/bench_server_latency.py ADDED
@@ -0,0 +1,187 @@
+ """
+ Benchmark the latency of serving a single batch with a real server.
+ This script launches a server and uses the HTTP interface.
+ It accepts arguments similar to those of launch_server.py.
+
+ Usage:
+
+ python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+ """
+
+ import argparse
+ import dataclasses
+ import itertools
+ import json
+ import multiprocessing
+ import os
+ import time
+ from typing import Tuple
+
+ import numpy as np
+ import requests
+
+ from sglang.srt.server import launch_server
+ from sglang.srt.server_args import ServerArgs
+ from sglang.srt.utils import kill_child_process
+
+
+ @dataclasses.dataclass
+ class BenchArgs:
+     run_name: str = "default"
+     batch_size: Tuple[int] = (1,)
+     input_len: Tuple[int] = (1024,)
+     output_len: Tuple[int] = (16,)
+     result_filename: str = "result.jsonl"
+
+     @staticmethod
+     def add_cli_args(parser: argparse.ArgumentParser):
+         parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
+         parser.add_argument(
+             "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
+         )
+         parser.add_argument(
+             "--input-len", type=int, nargs="+", default=BenchArgs.input_len
+         )
+         parser.add_argument(
+             "--output-len", type=int, nargs="+", default=BenchArgs.output_len
+         )
+         parser.add_argument(
+             "--result-filename", type=str, default=BenchArgs.result_filename
+         )
+
+     @classmethod
+     def from_cli_args(cls, args: argparse.Namespace):
+         # use the default value's type to case the args into correct types.
+         attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+         return cls(
+             **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+         )
+
+
+ def launch_server_internal(server_args):
+     try:
+         launch_server(server_args)
+     except Exception as e:
+         raise e
+     finally:
+         kill_child_process(os.getpid(), including_parent=False)
+
+
+ def launch_server_process(server_args: ServerArgs):
+     proc = multiprocessing.Process(target=launch_server_internal, args=(server_args,))
+     proc.start()
+     base_url = f"http://{server_args.host}:{server_args.port}"
+     timeout = 600
+
+     start_time = time.time()
+     while time.time() - start_time < timeout:
+         try:
+             headers = {
+                 "Content-Type": "application/json; charset=utf-8",
+             }
+             response = requests.get(f"{base_url}/v1/models", headers=headers)
+             if response.status_code == 200:
+                 return proc, base_url
+         except requests.RequestException:
+             pass
+         time.sleep(10)
+     raise TimeoutError("Server failed to start within the timeout period.")
+
+
+ def run_one_case(
+     url: str,
+     batch_size: int,
+     input_len: int,
+     output_len: int,
+     run_name: str,
+     result_filename: str,
+ ):
+     input_ids = [
+         [int(x) for x in np.random.randint(0, high=16384, size=(input_len,))]
+         for _ in range(batch_size)
+     ]
+
+     tic = time.time()
+     response = requests.post(
+         url + "/generate",
+         json={
+             "input_ids": input_ids,
+             "sampling_params": {
+                 "temperature": 0,
+                 "max_new_tokens": output_len,
+                 "ignore_eos": True,
+             },
+         },
+     )
+     latency = time.time() - tic
+
+     _ = response.json()
+     output_throughput = batch_size * output_len / latency
+     overall_throughput = batch_size * (input_len + output_len) / latency
+
+     print(f"batch size: {batch_size}")
+     print(f"latency: {latency:.2f} s")
+     print(f"output throughput: {output_throughput:.2f} token/s")
+     print(f"(input + output) throughput: {overall_throughput:.2f} token/s")
+
+     if result_filename:
+         with open(result_filename, "a") as fout:
+             res = {
+                 "run_name": run_name,
+                 "batch_size": batch_size,
+                 "input_len": input_len,
+                 "output_len": output_len,
+                 "latency": round(latency, 4),
+                 "output_throughput": round(output_throughput, 2),
+                 "overall_throughput": round(overall_throughput, 2),
+             }
+             fout.write(json.dumps(res) + "\n")
+
+
+ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
+     proc, base_url = launch_server_process(server_args)
+
+     # warmup
+     run_one_case(
+         base_url,
+         batch_size=16,
+         input_len=1024,
+         output_len=16,
+         run_name="",
+         result_filename="",
+     )
+
+     # benchmark
+     try:
+         for bs, il, ol in itertools.product(
+             bench_args.batch_size, bench_args.input_len, bench_args.output_len
+         ):
+             run_one_case(
+                 base_url,
+                 bs,
+                 il,
+                 ol,
+                 bench_args.run_name,
+                 bench_args.result_filename,
+             )
+     finally:
+         kill_child_process(proc.pid)
+
+     print(f"\nResults are saved to {bench_args.result_filename}")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     ServerArgs.add_cli_args(parser)
+     BenchArgs.add_cli_args(parser)
+     # For this script, model-path is not required
+     assert (
+         parser._actions[1].option_strings[0] == "--model-path"
+     ), "options changed, this code need to be updated"
+     parser._actions[1].required = False
+     args = parser.parse_args()
+
+     server_args = ServerArgs.from_cli_args(args)
+     bench_args = BenchArgs.from_cli_args(args)
+
+     run_benchmark(server_args, bench_args)
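
The new bench_server_latency.py appends one JSON object per benchmarked case to the file given by --result-filename (result.jsonl by default). A minimal sketch of reading those records back, assuming only the keys written by run_one_case above:

```python
# Illustrative post-processing of the JSONL produced by sglang.bench_server_latency.
import json

with open("result.jsonl") as fin:  # default --result-filename
    for line in fin:
        r = json.loads(line)
        print(
            f"{r['run_name']}: bs={r['batch_size']} input={r['input_len']} "
            f"output={r['output_len']} latency={r['latency']}s "
            f"output_throughput={r['output_throughput']} token/s"
        )
```
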
sglang/bench_serving.py CHANGED
@@ -2,7 +2,7 @@
  # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py

  """
- Benchmark online serving.
+ Benchmark online serving with dynamic requests.

  Usage:
  python3 -m sglang.bench_serving --backend sglang --num-prompt 10
sglang/srt/layers/activation.py CHANGED
@@ -19,7 +19,12 @@ from typing import Optional
  import torch
  import torch.nn as nn
  import torch.nn.functional as F
- from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+
+ from sglang.srt.utils import is_hip
+
+ if not is_hip():
+     from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+
  from vllm.distributed import (
      divide,
      get_tensor_model_parallel_rank,
@@ -29,8 +34,6 @@ from vllm.model_executor.custom_op import CustomOp
  from vllm.model_executor.layers.quantization import QuantizationConfig
  from vllm.model_executor.utils import set_weight_attrs

- from sglang.srt.utils import is_hip
-
  logger = logging.getLogger(__name__)


sglang/srt/layers/layernorm.py CHANGED
@@ -20,16 +20,19 @@ from typing import Optional, Tuple, Union

  import torch
  import torch.nn as nn
- from flashinfer.norm import (
-     fused_add_rmsnorm,
-     gemma_fused_add_rmsnorm,
-     gemma_rmsnorm,
-     rmsnorm,
- )
- from vllm.model_executor.custom_op import CustomOp

  from sglang.srt.utils import is_hip

+ if not is_hip():
+     from flashinfer.norm import (
+         fused_add_rmsnorm,
+         gemma_fused_add_rmsnorm,
+         gemma_rmsnorm,
+         rmsnorm,
+     )
+
+ from vllm.model_executor.custom_op import CustomOp
+
  logger = logging.getLogger(__name__)

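
Both the activation.py and layernorm.py hunks apply the same guard: the flashinfer kernels are imported only when sglang.srt.utils.is_hip() reports a CUDA (non-ROCm) build. A minimal sketch of that conditional-import idiom; the pure-PyTorch fallback below is an illustrative assumption, not the package's actual fallback path:

```python
# Sketch of the is_hip() import guard used above; the fallback is an assumption.
import torch

from sglang.srt.utils import is_hip

if not is_hip():
    from flashinfer.norm import rmsnorm  # CUDA-only kernel


def apply_rmsnorm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    if not is_hip():
        return rmsnorm(x, weight, eps)
    # Pure-PyTorch path for ROCm builds (illustrative fallback).
    variance = x.float().pow(2).mean(-1, keepdim=True)
    return (x.float() * torch.rsqrt(variance + eps)).to(x.dtype) * weight
```
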
sglang/srt/layers/sampler.py CHANGED
@@ -31,8 +31,11 @@ class Sampler(nn.Module):
          logits = logits.next_token_logits

          # Post process logits
+         logits = logits.contiguous()
          logits.div_(sampling_info.temperatures)
-         probs = logits[:] = torch.softmax(logits, dim=-1)
+         probs = torch.softmax(logits, dim=-1)
+         logits = None
+         del logits

          if torch.any(torch.isnan(probs)):
              logger.warning("Detected errors during sampling! NaN in the probability.")
@@ -53,7 +56,11 @@
              )
          else:
              batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
-                 probs, uniform_samples, sampling_info.top_ks, sampling_info.top_ps
+                 probs,
+                 uniform_samples,
+                 sampling_info.top_ks,
+                 sampling_info.top_ps,
+                 filter_apply_order="joint",
              )

          if not torch.all(success):
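
In the sampler, logits are now made contiguous, softmax writes to a fresh probs tensor instead of aliasing logits, the logits reference is dropped before sampling, and flashinfer's top_k_top_p_sampling_from_probs is called with filter_apply_order="joint", i.e. top-k and top-p are applied together before renormalizing. A rough PyTorch-only sketch of that joint filtering, as an illustration of the idea rather than the flashinfer kernel:

```python
import torch


def top_k_top_p_sample(probs: torch.Tensor, top_k: torch.Tensor, top_p: torch.Tensor) -> torch.Tensor:
    """probs: [batch, vocab]; top_k, top_p: per-request tensors of shape [batch]."""
    sorted_probs, sorted_idx = probs.sort(dim=-1, descending=True)
    cum_before = sorted_probs.cumsum(dim=-1) - sorted_probs  # mass of better-ranked tokens
    ranks = torch.arange(probs.shape[-1], device=probs.device).expand_as(sorted_probs)
    # "joint": a token survives only if it is inside BOTH the top-k set and the top-p nucleus.
    keep = (ranks < top_k.unsqueeze(-1)) & (cum_before < top_p.unsqueeze(-1))
    keep[..., 0] = True  # always keep the most likely token
    filtered = torch.where(keep, sorted_probs, torch.zeros_like(sorted_probs))
    filtered = filtered / filtered.sum(dim=-1, keepdim=True)
    choice = torch.multinomial(filtered, num_samples=1)
    return sorted_idx.gather(-1, choice).squeeze(-1)
```
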
sglang/srt/managers/io_struct.py CHANGED
@@ -133,6 +133,9 @@ class GenerateReqInput:
                  self.image_data = [None] * num
              elif not isinstance(self.image_data, list):
                  self.image_data = [self.image_data] * num
+             elif isinstance(self.image_data, list):
+                 # multi-image with n > 1
+                 self.image_data = self.image_data * num

              if self.sampling_params is None:
                  self.sampling_params = [{}] * num
sglang/srt/managers/policy_scheduler.py CHANGED
@@ -119,19 +119,32 @@ class PrefillAdder:
          self.running_batch = running_batch
          self.new_token_ratio = new_token_ratio
          self.rem_total_tokens = rem_total_tokens - mixed_with_decode_tokens
-         self.rem_total_tokens_ = self.rem_total_tokens
-         self.total_tokens = rem_total_tokens
          self.rem_input_tokens = rem_input_tokens - mixed_with_decode_tokens
          self.rem_chunk_tokens = rem_chunk_tokens
          if self.rem_chunk_tokens is not None:
              self.rem_chunk_tokens -= mixed_with_decode_tokens

+         self.cur_rem_tokens = rem_total_tokens - mixed_with_decode_tokens
+
          self.req_states = None
          self.can_run_list = []
          self.new_inflight_req = None
          self.log_hit_tokens = 0
          self.log_input_tokens = 0

+         if running_batch is not None:
+             # Pre-remove the tokens which will be occupied by the running requests
+             self.rem_total_tokens -= sum(
+                 [
+                     min(
+                         (r.sampling_params.max_new_tokens - len(r.output_ids)),
+                         CLIP_MAX_NEW_TOKENS,
+                     )
+                     * self.new_token_ratio
+                     for r in running_batch.reqs
+                 ]
+             )
+
      def no_remaining_tokens(self):
          return (
              self.rem_total_tokens <= 0
@@ -141,31 +154,14 @@
                  if self.rem_chunk_tokens is not None
                  else False
              )
-         )
-
-     def remove_running_tokens(self, running_batch: ScheduleBatch):
-         self.rem_total_tokens -= sum(
-             [
-                 min(
-                     (r.sampling_params.max_new_tokens - len(r.output_ids)),
-                     CLIP_MAX_NEW_TOKENS,
-                 )
-                 * self.new_token_ratio
-                 for r in running_batch.reqs
-             ]
-         )
-         self.rem_total_tokens_ -= sum(
-             [
-                 r.sampling_params.max_new_tokens - len(r.output_ids)
-                 for r in running_batch.reqs
-             ]
+             or self.cur_rem_tokens <= 0
          )

      def _prefill_one_req(
          self, prefix_len: int, extend_input_len: int, max_new_tokens: int
      ):
          self.rem_total_tokens -= extend_input_len + max_new_tokens
-         self.rem_total_tokens_ -= extend_input_len + max_new_tokens
+         self.cur_rem_tokens -= extend_input_len
          self.rem_input_tokens -= extend_input_len
          if self.rem_chunk_tokens is not None:
              self.rem_chunk_tokens -= extend_input_len
@@ -173,29 +169,7 @@
          self.log_hit_tokens += prefix_len
          self.log_input_tokens += extend_input_len

-     def add_inflight_req_ignore_eos(self, req: Req):
-         truncated = req.extend_input_len > self.rem_chunk_tokens
-         req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
-         req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
-         self.can_run_list.append(req)
-
-         self._prefill_one_req(
-             0,
-             req.extend_input_len,
-             (
-                 min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
-                 if not truncated
-                 else 0
-             ),
-         )
-
-         # Return if chunked prefill not finished
-         return req if truncated else None
-
      def add_inflight_req(self, req: Req):
-         if req.sampling_params.ignore_eos:
-             return self.add_inflight_req_ignore_eos(req)
-
          truncated = req.extend_input_len > self.rem_chunk_tokens
          req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
          req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
@@ -225,7 +199,7 @@
          self.rem_total_tokens += delta

      def add_one_req_ignore_eos(self, req: Req):
-         def get_req_state(r):
+         def add_req_state(r, insert_sort=False):
              new_token_ratio = (
                  1.0 if r.sampling_params.ignore_eos else self.new_token_ratio
              )
@@ -235,56 +209,38 @@
              tokens_occupied = len(r.origin_input_ids) + len(r.output_ids)

              if tokens_left > 0:
-                 return (tokens_left, tokens_occupied)
-
-             return None
-
-         # Quick Check
-         can_run = False
-         if (
-             req.extend_input_len + req.sampling_params.max_new_tokens
-             <= self.rem_total_tokens
-         ):
-             can_run = True
-
-         if not can_run:
-             if self.req_states is None:
-                 self.req_states = []
-                 if self.running_batch is not None:
-                     for r in self.running_batch.reqs:
-                         state = get_req_state(r)
-                         if state is not None:
-                             self.req_states.append(state)
-                 for r in self.can_run_list:
-                     state = get_req_state(r)
-                     if state is not None:
-                         self.req_states.append(state)
-                 state = get_req_state(req)
-                 if state is not None:
-                     self.req_states.append(state)
-
-                 self.req_states.sort(key=lambda x: x[0])
-             else:
-                 state = get_req_state(req)
-                 if state is not None:
-                     for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
-                         if tokens_left >= state[0]:
-                             self.req_states.insert(i, state)
+                 if not insert_sort:
+                     self.req_states.append((tokens_left, tokens_occupied))
+                 else:
+                     for i in range(len(self.req_states)):
+                         if tokens_left <= self.req_states[i][0]:
                              break
-                     else:
-                         self.req_states.append(state)
-
-             tokens_freed = 0
-             for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
-                 decode_steps = (
-                     self.req_states[i + 1][0]
-                     if i + 1 < len(self.req_states)
-                     else tokens_left
-                 )
-                 bs = len(self.req_states) - i
-                 if self.total_tokens + tokens_freed - decode_steps * bs <= 0:
-                     return False
-                 tokens_freed += tokens_occupied
+                     self.req_states.insert(i, (tokens_left, tokens_occupied))
+
+         if self.req_states is None:
+             self.req_states = []
+             add_req_state(req)
+             if self.running_batch is not None:
+                 for r in self.running_batch.reqs:
+                     add_req_state(r)
+             for r in self.can_run_list:
+                 add_req_state(r)
+             self.req_states.sort(key=lambda x: x[0])
+         else:
+             add_req_state(req, insert_sort=True)
+
+         cur_rem_tokens = self.cur_rem_tokens - len(req.origin_input_ids)
+         tokens_freed = 0
+         for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
+             decode_steps = (
+                 self.req_states[i + 1][0]
+                 if i + 1 < len(self.req_states)
+                 else tokens_left
+             )
+             bs = len(self.req_states) - i
+             if cur_rem_tokens + tokens_freed - decode_steps * bs <= 0:
+                 return False
+             tokens_freed += tokens_occupied

          if req.extend_input_len <= self.rem_chunk_tokens:
              self.can_run_list.append(req)
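
The rewritten add_one_req_ignore_eos keeps req_states as a list of (tokens_left, tokens_occupied) pairs sorted by tokens_left, then walks it to check whether the requests that finish earliest free enough KV-cache tokens for the remaining ones to keep decoding. A toy re-run of that feasibility loop on made-up numbers, not the scheduler itself:

```python
# Toy illustration of the feasibility check above; all numbers are invented.
req_states = sorted([(4, 10), (8, 12), (16, 20)])  # (tokens_left, tokens_occupied)
cur_rem_tokens = 30  # free tokens after reserving the new request's prompt

tokens_freed = 0
feasible = True
for i, (tokens_left, tokens_occupied) in enumerate(req_states):
    decode_steps = req_states[i + 1][0] if i + 1 < len(req_states) else tokens_left
    bs = len(req_states) - i  # requests still decoding during these steps
    if cur_rem_tokens + tokens_freed - decode_steps * bs <= 0:
        feasible = False
        break
    tokens_freed += tokens_occupied  # request i finishes and releases its slots

print(feasible)  # True: 30 - 8*3 = 6, then 40 - 16*2 = 8, then 52 - 16*1 = 36, all > 0
```
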
sglang/srt/managers/schedule_batch.py CHANGED
@@ -40,7 +40,7 @@ global_server_args_dict = {
      "attention_backend": ServerArgs.attention_backend,
      "sampling_backend": ServerArgs.sampling_backend,
      "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
-     "enable_mla": ServerArgs.enable_mla,
+     "disable_mla": ServerArgs.disable_mla,
      "torchao_config": ServerArgs.torchao_config,
  }

sglang/srt/managers/tp_worker.py CHANGED
@@ -445,9 +445,6 @@ class ModelTpServer:
              num_mixed_running,
          )

-         if self.running_batch is not None:
-             adder.remove_running_tokens(self.running_batch)
-
          has_inflight = self.current_inflight_req is not None
          if self.current_inflight_req is not None:
              self.current_inflight_req.init_next_round_input(
@@ -465,9 +462,6 @@
              )

          for req in self.waiting_queue:
-             if adder.no_remaining_tokens():
-                 break
-             req.init_next_round_input(None if prefix_computed else self.tree_cache)
              if (
                  self.lora_paths is not None
                  and len(
@@ -478,6 +472,10 @@
                  > self.max_loras_per_batch
              ):
                  break
+
+             if adder.no_remaining_tokens():
+                 break
+             req.init_next_round_input(None if prefix_computed else self.tree_cache)
              res = adder.add_one_req(req)
              if (
                  not res
@@ -507,6 +505,11 @@
          else:
              tree_cache_hit_rate = 0.0

+         num_used = self.max_total_num_tokens - (
+             self.token_to_kv_pool.available_size()
+             + self.tree_cache.evictable_size()
+         )
+
          if num_mixed_running > 0:
              logger.info(
                  f"Prefill batch"
@@ -515,6 +518,7 @@
                  f"#new-token: {adder.log_input_tokens}, "
                  f"#cached-token: {adder.log_hit_tokens}, "
                  f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
+                 f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
                  f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}"
              )
          else:
@@ -524,6 +528,7 @@
                  f"#new-token: {adder.log_input_tokens}, "
                  f"#cached-token: {adder.log_hit_tokens}, "
                  f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
+                 f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
                  f"#running-req: {running_bs}, "
                  f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}"
              )
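
The new "token usage" field in the prefill log is the occupied fraction of the KV-cache token pool: num_used = max_total_num_tokens - (available_size() + evictable_size()). For example, with a pool of 100,000 tokens, 30,000 free slots, and 10,000 evictable cached slots, num_used is 100,000 - (30,000 + 10,000) = 60,000, so the log line reports token usage: 0.60 (the numbers are illustrative).
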
sglang/srt/model_executor/cuda_graph_runner.py CHANGED
@@ -108,6 +108,10 @@ class CudaGraphRunner:
              self.capture_bs = list(range(1, 32)) + [64, 128]
          else:
              self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
+
+         self.capture_bs = [
+             bs for bs in self.capture_bs if bs <= model_runner.req_to_token_pool.size
+         ]
          self.compile_bs = (
              [
                  bs
@@ -118,21 +122,8 @@
              else []
          )

-         # Common inputs
-         self.max_bs = max(self.capture_bs)
-         self.input_ids = torch.zeros((self.max_bs,), dtype=torch.int32, device="cuda")
-         self.req_pool_indices = torch.zeros(
-             (self.max_bs,), dtype=torch.int32, device="cuda"
-         )
-         self.seq_lens = torch.ones((self.max_bs,), dtype=torch.int32, device="cuda")
-         self.position_ids_offsets = torch.ones(
-             (self.max_bs,), dtype=torch.int32, device="cuda"
-         )
-         self.out_cache_loc = torch.zeros(
-             (self.max_bs,), dtype=torch.int32, device="cuda"
-         )
-
          # Attention backend
+         self.max_bs = max(self.capture_bs)
          self.model_runner.attn_backend.init_cuda_graph_state(self.max_bs)
          self.seq_len_fill_value = (
              self.model_runner.attn_backend.get_cuda_graph_seq_len_fill_value()
@@ -141,6 +132,16 @@
          if self.use_torch_compile:
              set_torch_compile_config()

+         # Common inputs
+         with torch.device("cuda"):
+             self.input_ids = torch.zeros((self.max_bs,), dtype=torch.int32)
+             self.req_pool_indices = torch.zeros((self.max_bs,), dtype=torch.int32)
+             self.seq_lens = torch.full(
+                 (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
+             )
+             self.position_ids_offsets = torch.ones((self.max_bs,), dtype=torch.int32)
+             self.out_cache_loc = torch.zeros((self.max_bs,), dtype=torch.int32)
+
          # Capture
          try:
              self.capture()
sglang/srt/model_executor/model_runner.py CHANGED
@@ -86,12 +86,20 @@ class ModelRunner:
          self.is_multimodal_model = is_multimodal_model(
              self.model_config.hf_config.architectures
          )
+
+         if (
+             self.model_config.attention_arch == AttentionArch.MLA
+             and not self.server_args.disable_mla
+         ):
+             logger.info("MLA optimization is tunred on. Use triton backend.")
+             self.server_args.attention_backend = "triton"
+
          global_server_args_dict.update(
              {
                  "attention_backend": server_args.attention_backend,
                  "sampling_backend": server_args.sampling_backend,
                  "triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32,
-                 "enable_mla": server_args.enable_mla,
+                 "disable_mla": server_args.disable_mla,
                  "torchao_config": server_args.torchao_config,
              }
          )
@@ -329,7 +337,7 @@ class ModelRunner:
          )
          if (
              self.model_config.attention_arch == AttentionArch.MLA
-             and self.server_args.enable_mla
+             and not self.server_args.disable_mla
          ):
              cell_size = (
                  (self.model_config.kv_lora_rank + self.model_config.qk_rope_head_dim)
@@ -392,12 +400,12 @@ class ModelRunner:
          )

          self.req_to_token_pool = ReqToTokenPool(
-             max_num_reqs,
-             self.model_config.context_len + 8,
+             max_num_reqs + 1,
+             self.model_config.context_len + 4,
          )
          if (
              self.model_config.attention_arch == AttentionArch.MLA
-             and self.server_args.enable_mla
+             and not self.server_args.disable_mla
          ):
              self.token_to_kv_pool = MLATokenToKVPool(
                  self.max_total_num_tokens,
sglang/srt/models/deepseek_v2.py CHANGED
@@ -507,7 +507,7 @@ class DeepseekV2DecoderLayer(nn.Module):
          rope_theta = getattr(config, "rope_theta", 10000)
          rope_scaling = getattr(config, "rope_scaling", None)
          max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
-         if global_server_args_dict["enable_mla"]:
+         if not global_server_args_dict["disable_mla"]:
              self.self_attn = DeepseekV2AttentionMLA(
                  config=config,
                  hidden_size=self.hidden_size,
@@ -732,7 +732,7 @@ class DeepseekV2ForCausalLM(nn.Module):
                  )
                  weight_loader(param, loaded_weight)

-         if global_server_args_dict["enable_mla"]:
+         if not global_server_args_dict["disable_mla"]:
              for layer_id in range(self.config.num_hidden_layers):
                  self_attn = self.model.layers[layer_id].self_attn
                  w_kc, w_vc = self_attn.kv_b_proj.weight.unflatten(
sglang/srt/models/llama.py CHANGED
@@ -305,8 +305,6 @@ class LlamaForCausalLM(nn.Module):
          self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
          self.logits_processor = LogitsProcessor(config)

-         self.param_dict = dict(self.named_parameters())
-
      @torch.no_grad()
      def forward(
          self,
@@ -374,7 +372,7 @@ class LlamaForCausalLM(nn.Module):
              (".gate_up_proj", ".gate_proj", 0),
              (".gate_up_proj", ".up_proj", 1),
          ]
-         params_dict = self.param_dict
+         params_dict = dict(self.named_parameters())

          for name, loaded_weight in weights:
              if "rotary_emb.inv_freq" in name or "projector" in name:
sglang/srt/models/llama_classification.py CHANGED
@@ -36,6 +36,7 @@ class LlamaForClassification(nn.Module):
      ) -> None:
          super().__init__()
          self.config = config
+         self.torchao_config = None
          self.quant_config = quant_config
          self.model = LlamaModel(config, quant_config=quant_config)

@@ -44,8 +45,6 @@ class LlamaForClassification(nn.Module):
          )
          self.eos_token_id = config.eos_token_id

-         self.param_dict = dict(self.named_parameters())
-
      @torch.no_grad()
      def forward(
          self,
@@ -77,7 +76,7 @@ class LlamaForClassification(nn.Module):
          return logits_output

      def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-         params_dict = self.param_dict
+         params_dict = dict(self.named_parameters())

          for name, loaded_weight in weights:
              if "classification_head" in name:
sglang/srt/models/minicpm3.py CHANGED
@@ -419,7 +419,7 @@ class MiniCPM3DecoderLayer(nn.Module):
          rope_theta = getattr(config, "rope_theta", 10000)
          rope_scaling = getattr(config, "rope_scaling", None)
          max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
-         if global_server_args_dict["enable_mla"]:
+         if not global_server_args_dict["disable_mla"]:
              self.self_attn = MiniCPM3AttentionMLA(
                  config=config,
                  hidden_size=self.hidden_size,
@@ -653,7 +653,7 @@ class MiniCPM3ForCausalLM(nn.Module):
                  )
                  weight_loader(param, loaded_weight)

-         if global_server_args_dict["enable_mla"]:
+         if not global_server_args_dict["disable_mla"]:
              for layer_id in range(self.config.num_hidden_layers):
                  self_attn = self.model.layers[layer_id].self_attn
                  w_kc, w_vc = self_attn.kv_b_proj.weight.unflatten(
sglang/srt/models/xverse.py CHANGED
@@ -307,8 +307,6 @@ class XverseForCausalLM(nn.Module):
          self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
          self.logits_processor = LogitsProcessor(config)

-         self.param_dict = dict(self.named_parameters())
-
      @torch.no_grad()
      def forward(
          self,
@@ -333,7 +331,7 @@ class XverseForCausalLM(nn.Module):
              ("gate_up_proj", "gate_proj", 0),
              ("gate_up_proj", "up_proj", 1),
          ]
-         params_dict = self.param_dict
+         params_dict = dict(self.named_parameters())

          def load_weights_per_param(name, loaded_weight):
              if "rotary_emb.inv_freq" in name or "projector" in name:
sglang/srt/models/xverse_moe.py CHANGED
@@ -383,8 +383,6 @@ class XverseMoeForCausalLM(nn.Module):
          )
          self.logits_processor = LogitsProcessor(config)

-         self.param_dict = dict(self.named_parameters())
-
      @torch.no_grad()
      def forward(
          self,
@@ -406,8 +404,7 @@ class XverseMoeForCausalLM(nn.Module):
              ("gate_up_proj", "gate_proj", 0),
              ("gate_up_proj", "up_proj", 1),
          ]
-
-         params_dict = self.param_dict
+         params_dict = dict(self.named_parameters())

          for name, loaded_weight in weights:
              if "rotary_emb.inv_freq" in name:
sglang/srt/server_args.py CHANGED
@@ -26,17 +26,6 @@ from sglang.srt.utils import is_hip
  logger = logging.getLogger(__name__)


- class LoRAPathAction(argparse.Action):
-     def __call__(self, parser, namespace, values, option_string=None):
-         setattr(namespace, self.dest, {})
-         for lora_path in values:
-             if "=" in lora_path:
-                 name, path = lora_path.split("=", 1)
-                 getattr(namespace, self.dest)[name] = path
-             else:
-                 getattr(namespace, self.dest)[lora_path] = lora_path
-
-
  @dataclasses.dataclass
  class ServerArgs:
      # Model and tokenizer
@@ -108,12 +97,12 @@ class ServerArgs:
      disable_cuda_graph_padding: bool = False
      disable_disk_cache: bool = False
      disable_custom_all_reduce: bool = False
+     disable_mla: bool = False
      enable_mixed_chunk: bool = False
      enable_torch_compile: bool = False
      max_torch_compile_bs: int = 32
      torchao_config: str = ""
      enable_p2p_check: bool = False
-     enable_mla: bool = False
      triton_attention_reduce_in_fp32: bool = False

      # LoRA
@@ -173,10 +162,6 @@ class ServerArgs:
              self.sampling_backend = "pytorch"

          # Default kernel backends
-         if self.enable_mla:
-             logger.info("MLA optimization is tunred on. Use triton backend.")
-             self.attention_backend = "triton"
-
          if self.attention_backend is None:
              self.attention_backend = "flashinfer"

@@ -514,6 +499,11 @@ class ServerArgs:
              default=False,
              help="Disable the custom all-reduce kernel and fall back to NCCL.",
          )
+         parser.add_argument(
+             "--disable-mla",
+             action="store_true",
+             help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
+         )
          parser.add_argument(
              "--enable-mixed-chunk",
              action="store_true",
@@ -541,11 +531,6 @@ class ServerArgs:
              action="store_true",
              help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
          )
-         parser.add_argument(
-             "--enable-mla",
-             action="store_true",
-             help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
-         )
          parser.add_argument(
              "--triton-attention-reduce-in-fp32",
              action="store_true",
@@ -623,3 +608,14 @@ class PortArgs:
      controller_port: int
      detokenizer_port: int
      nccl_ports: List[int]
+
+
+ class LoRAPathAction(argparse.Action):
+     def __call__(self, parser, namespace, values, option_string=None):
+         setattr(namespace, self.dest, {})
+         for lora_path in values:
+             if "=" in lora_path:
+                 name, path = lora_path.split("=", 1)
+                 getattr(namespace, self.dest)[name] = path
+             else:
+                 getattr(namespace, self.dest)[lora_path] = lora_path
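
With this release the MLA path for DeepSeek-V2-style models is on by default: the --enable-mla flag and ServerArgs.enable_mla field are replaced by --disable-mla / ServerArgs.disable_mla, and LoRAPathAction simply moves to the end of the module. A small, hedged sketch of the renamed field from Python (the model path is illustrative; ServerArgs fills in the remaining defaults):

```python
# Illustrative use of the renamed field; only model_path and disable_mla are shown.
from sglang.srt.server_args import ServerArgs

args = ServerArgs(model_path="deepseek-ai/DeepSeek-V2-Lite")
print(args.disable_mla)  # False -> MLA stays enabled for MLA-capable models

args_no_mla = ServerArgs(model_path="deepseek-ai/DeepSeek-V2-Lite", disable_mla=True)
print(args_no_mla.disable_mla)  # True -> fall back to the non-MLA attention path
```
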
sglang/test/few_shot_gsm8k.py CHANGED
@@ -44,7 +44,7 @@ def get_answer_value(answer_str):
      return INVALID


- def main(args):
+ def run_eval(args):
      # Select backend
      set_default_backend(RuntimeEndpoint(f"{args.host}:{args.port}"))

@@ -119,6 +119,12 @@ def main(args):
      # Dump results
      dump_state_text("tmp_output_gsm8k.txt", states)

+     return {
+         "accuracy": acc,
+         "latency": latency,
+         "output_throughput": output_throughput,
+     }
+

  if __name__ == "__main__":
      parser = argparse.ArgumentParser()
@@ -129,4 +135,4 @@ if __name__ == "__main__":
      parser.add_argument("--host", type=str, default="http://127.0.0.1")
      parser.add_argument("--port", type=int, default=30000)
      args = parser.parse_args()
-     main(args)
+     run_eval(args)
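
Renaming main to run_eval and returning the metrics dictionary makes the GSM-8K few-shot benchmark callable from test code rather than CLI-only. A hedged sketch of programmatic use; host and port mirror the CLI flags shown above, while the other Namespace fields (num_shots, num_questions, parallel, data_path) are assumptions about the script's remaining arguments, which are not visible in this diff:

```python
# Illustrative driver for sglang.test.few_shot_gsm8k.run_eval.
import argparse

from sglang.test.few_shot_gsm8k import run_eval

args = argparse.Namespace(
    host="http://127.0.0.1",
    port=30000,
    # Assumed defaults for the flags not shown in this diff.
    num_shots=5,
    num_questions=200,
    parallel=128,
    data_path=None,
)
metrics = run_eval(args)  # {"accuracy": ..., "latency": ..., "output_throughput": ...}
print(metrics["accuracy"])
```
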
sglang/test/test_utils.py CHANGED
@@ -22,6 +22,7 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.srt.utils import kill_child_process
  from sglang.utils import get_exception_traceback

+ DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
  DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
  DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
  DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.3.1.post1"
+ __version__ = "0.3.1.post2"
sglang-0.3.1.post1.dist-info/METADATA → sglang-0.3.1.post2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.3.1.post1
+ Version: 0.3.1.post2
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
                                   Version 2.0, January 2004
@@ -269,7 +269,7 @@ Requires-Dist: peft; extra == "test"

  --------------------------------------------------------------------------------

- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |

  SGLang is a fast serving framework for large language models and vision language models.
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -278,7 +278,7 @@ The core features include:
  - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
  - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
- - **Active Community**: SGLang is open-source and backed by an active community with industry adoption, welcoming contributions to improve LLM and VLM serving.
+ - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

  ## News
  - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.3.1.post1 https://github.com/sgl-project/sglang.git
+ git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -483,7 +483,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
  - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
- - To enable DeepSeek MLA acceleration, add `--enable-mla`.
  - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
  - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
sglang-0.3.1.post1.dist-info/RECORD → sglang-0.3.1.post2.dist-info/RECORD CHANGED
@@ -1,13 +1,14 @@
  sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
  sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
- sglang/bench_latency.py,sha256=CDMrch4QwIyb2DTH2kBIgQ6Q8sGHwtrx3Cz49qZNfpU,17078
- sglang/bench_serving.py,sha256=6OM5JIDuoxJDg-VLE4ijGGcS8-6ViaidV05lIrZmSzo,36239
+ sglang/bench_latency.py,sha256=bA50iUYOxEnLjzY2S4AgwxtSAqujUbGfQFwbLZj5XNc,17160
+ sglang/bench_server_latency.py,sha256=KvFJgKQTSons7KOG0CBqnnOOx1gW29bBM1Z3GQO_6-E,5599
+ sglang/bench_serving.py,sha256=3gIJ1O2x51Fwd4wYJjgwluTbWKXL-azckQte7YC5zIc,36261
  sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
  sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
  sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
  sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
  sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
- sglang/version.py,sha256=83xK6WSmRR5ba-i5fDLUmoJT83Eg_dpsWgwcnsUhMpA,28
+ sglang/version.py,sha256=U9F0UlFDynnYN5dX-kxehylWCwXo9a6E6W4FfDusfRg,28
  sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
  sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -26,7 +27,7 @@ sglang/srt/conversation.py,sha256=S5w5V6G1xigNxa3UQoSxRcMpQLWWDT9EPBoHBvHkSAk,19
  sglang/srt/hf_transformers_utils.py,sha256=6HlqcmGPIvnSGaEEICeuzwag1QylSoSGbXRVvUdIMDo,6016
  sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
  sglang/srt/server.py,sha256=n4QRn36_t-HAH-lSME3tiZSCUGRQwqMUckgs0paHq5g,20179
- sglang/srt/server_args.py,sha256=M1Bm9u2JRsEptne-kw-D-B_29Q-M6V4UpAM7K-JxXAc,23309
+ sglang/srt/server_args.py,sha256=3XjDt6SSjTfbOe0HSXA--2aUvrpWSnQmAHYwmeS1-M0,23159
  sglang/srt/utils.py,sha256=8yxiMRttCcfswynkNPWD3yZFNAGFz2P1PzSuxHCBGns,22340
  sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
  sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
@@ -35,14 +36,14 @@ sglang/srt/constrained/__init__.py,sha256=ze8awDPvwAzdeMwzJ-25kXOQ4nVWoaP55jBDt5
  sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
  sglang/srt/constrained/fsm_cache.py,sha256=k7DRUAaiLTEX5UarfJ17gEYQ-QWQAGfvykjYFkM_Y2U,2982
  sglang/srt/constrained/jump_forward.py,sha256=9_HxmXtWjr5S6a5e0cBimbY3ZhiLiJC74V6jIqDXfuo,6575
- sglang/srt/layers/activation.py,sha256=awcwOODYcVdUtC2JxJ1TGsV8Tru0eACKcxYN6cWHbl4,5148
+ sglang/srt/layers/activation.py,sha256=i3omgj3GdUIZBqJNUjpdJsMc2UM3Lx07FT2J1WICrqA,5171
  sglang/srt/layers/attention_backend.py,sha256=lqMsY4VaOO_szIWoTAinXf1DnP2UsbF32kzvwFySz9w,18119
  sglang/srt/layers/flashinfer_utils.py,sha256=jyaO7XiEisFZg_dfaCbfRCHSHSKYoM1wOzfHa0h1q14,7413
- sglang/srt/layers/layernorm.py,sha256=-9Yph4nnMZYX_Q31MUGAimLajNclHXjgDkswpU2BTos,3694
+ sglang/srt/layers/layernorm.py,sha256=p_7bnmSpJ_slpoP0Gk5wQPpHtLllUu3imSIRBqGqTP0,3737
  sglang/srt/layers/logits_processor.py,sha256=Js2qSk1Z3uPL2cYO1ARai51f2i8OedV3qdwByQVSJtI,12439
  sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
  sglang/srt/layers/radix_attention.py,sha256=EcVO0fUSmgvE_9R-MlpgJq0O_uT8ACuHzbMi19bANYc,1874
- sglang/srt/layers/sampler.py,sha256=Q4u46oYu66e34rBNzr50VoXO8FM-assYiCoROolq3Zs,3661
+ sglang/srt/layers/sampler.py,sha256=Y0o1bndTGRD713fHMbN5-LRUiyneBkb7bH_QlkkeqSs,3836
  sglang/srt/layers/torchao_utils.py,sha256=rTECwKSXhj_ylh_iSzfbopz9_lZOFHatquQrNJNLZlE,2703
  sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
  sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
@@ -56,38 +57,38 @@ sglang/srt/lora/lora_manager.py,sha256=7J7cGmyy1Ph4HCvLdM-ViAizAbV1snZqD-S7JLWXa
  sglang/srt/managers/controller_multi.py,sha256=KolZDso2WqH1ZhQw9p1eTmlFRgo4bcvzBxE44_sNE_o,6300
  sglang/srt/managers/controller_single.py,sha256=DiZALP_iIPZQMRx09a-LwT5_Dg7p-WU8HXyMoxJ9sRA,4955
  sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
- sglang/srt/managers/io_struct.py,sha256=bqmL3NDPLqOn6Au3WLF0NOe8Dh7ECMN7BTHCkEZ_Edk,11247
- sglang/srt/managers/policy_scheduler.py,sha256=tiBUi2GJU5eQEBK6HfsO1_YjWtFkougo40954DIp4dM,13026
- sglang/srt/managers/schedule_batch.py,sha256=ppHYK65GP0dtuCEzpSbGm9uAne5rEoRmW8osLknXJpI,27384
+ sglang/srt/managers/io_struct.py,sha256=yNV5BmeUzLPqv19j79kXQ50Iaqdk4vP-_TciiRf4OEE,11396
+ sglang/srt/managers/policy_scheduler.py,sha256=PVo0DV0-5ODNN7FkPkeF1Y8BQ6uuLldPETOlB_YvvL4,11560
+ sglang/srt/managers/schedule_batch.py,sha256=ns2qkaYAvzul-LCV1BEB6q1t5jKyftNsReMv62PC8M0,27386
  sglang/srt/managers/tokenizer_manager.py,sha256=ql-sObjl1oRigJwnLtqqTaaw-i7gPTDMoNXDEMftr40,29643
- sglang/srt/managers/tp_worker.py,sha256=4Hhla9rfGYEdQtzGmxlIEqxt_WVkn2dkLLNQZHgpkf0,39270
+ sglang/srt/managers/tp_worker.py,sha256=0Y0k-roDrBxWZxD0axv5CCvUUW8vsJ8n78TANHLzEFs,39503
  sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
  sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
  sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
  sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
  sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
- sglang/srt/model_executor/cuda_graph_runner.py,sha256=ZeO-8Mg4Tf0iP-L9FXcyhHfNzGWpTPEDGeUoC2lzHTE,10418
+ sglang/srt/model_executor/cuda_graph_runner.py,sha256=gZ0Wukqz6u67MMIj4MC8JET9jcHdh0rotYzpuPlHruY,10512
  sglang/srt/model_executor/forward_batch_info.py,sha256=yvkhayY9Zu6gysoojcGT73lADGOtfHKkFKWdJLRyACI,6141
- sglang/srt/model_executor/model_runner.py,sha256=LoQ7OFVwOiK_BfdpRfitss1TfJ8qrysHgWM-xXu7n2Y,22433
+ sglang/srt/model_executor/model_runner.py,sha256=X7AG1k9AI_kqS8q1i5Bfv-kFysIdqJAVWMGGZoAPThY,22726
  sglang/srt/models/baichuan.py,sha256=NrG1rMJXhemkrUCEf8xKOSDQVsOD-nN8RQz6MWHOg84,15124
  sglang/srt/models/chatglm.py,sha256=KwxLHBEvK02McXDvBS0gnRxfIvOAu2QP7lgibrj9Nbc,13371
  sglang/srt/models/commandr.py,sha256=2rAXRZRb4PkJZ4NWEqP_rIgsjxbdZyHpuoMOarqTWzQ,14163
  sglang/srt/models/dbrx.py,sha256=N_0Ku_p1NCsc29NktUBNqPv7Z33XhYxOZK5xN7nzW4s,14661
  sglang/srt/models/deepseek.py,sha256=7UJgde1EV9ey6d-CKRcEyTKh1_WhZdatpZiltIuqpik,16006
- sglang/srt/models/deepseek_v2.py,sha256=bPaGRL8ieBCXKIf-KY7-D9Rus7Qj3VGvvtERzAXAZWs,28421
+ sglang/srt/models/deepseek_v2.py,sha256=1J0pt1jZRcBBGYbgt1wGiuxPcrdpfTEUEaGFqju6TVA,28431
  sglang/srt/models/exaone.py,sha256=3I5ZoiLotf7U-8c9QJRubpgf6JDx9I_z-ViXQlCC-x8,13087
  sglang/srt/models/gemma.py,sha256=GkwgGFHgGlXgBZN7s7Wooz5tMyCp1YtgLahU2NOo66M,12273
  sglang/srt/models/gemma2.py,sha256=sFfCNEm0_OOWElRSTDuroRv8wNMX8v_81Uko9m546KA,14923
  sglang/srt/models/gpt_bigcode.py,sha256=kzHYogeGXZF4KHpkXA-RGqvs016mA-6klWxD2QJTi9E,10195
  sglang/srt/models/grok.py,sha256=6I4OwQwNyAbh5GF24_SRm12XYBvM9iGWB-T4TSTJ0wU,14929
  sglang/srt/models/internlm2.py,sha256=6j7JH0p3yib8GZDH8Cmrs-pgwfH3eOlAK6V3Cq64O7w,12202
- sglang/srt/models/llama.py,sha256=tjdjlIxJr31vgbzGBP_el9RgYxw1kzvmqnVinnTVVUw,15259
- sglang/srt/models/llama_classification.py,sha256=A2ABTUD5u4XoWv1dsIPU7wcCQP3jhbDJblMhLgaiFBA,3402
+ sglang/srt/models/llama.py,sha256=nbJwRcG9DnurVNSGLKJjnmBmTXP1_5WZpudth_0PVpw,15216
+ sglang/srt/models/llama_classification.py,sha256=HF-69J9qIYdfX0R5wEtIgvafMzprKcXdvF3W_orl_kA,3394
  sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
  sglang/srt/models/llava.py,sha256=O4XGdl70Hh4tM_OHapFGHbReC82mbe9xLw6GELKWKhU,24881
  sglang/srt/models/llavavid.py,sha256=ou5uIuskBoBo0lXvqFFfDLBYYVfehx27n-Lu8X9gpLs,11992
  sglang/srt/models/minicpm.py,sha256=ioqCsTCE_oF8xqGF5fm5cK9dclK5Y0EQ1UJfyteIDDo,13825
- sglang/srt/models/minicpm3.py,sha256=_C96kO3qGK0KRctXZf8LBR9s0sEW0QXWSGU0Vf6OrI8,25206
+ sglang/srt/models/minicpm3.py,sha256=McPWyy2fQqfHUhi9Nk36rkvvPAS8RmLOY7Vh4ah5c1w,25216
  sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
  sglang/srt/models/mixtral.py,sha256=oRC7mKBrPJhvzkWSabrbeQQQac-jtF4EV6H2Sgjc5JY,13897
  sglang/srt/models/mixtral_quant.py,sha256=wMACJq78OTWj7HlqPDRNEh8cjrVAjKqJEsOG3CO5xow,14072
@@ -96,8 +97,8 @@ sglang/srt/models/qwen.py,sha256=nqSRzkiZzpRVG6WGQ1MBUclQnXyw8jlvoOq-euM8j5s,995
  sglang/srt/models/qwen2.py,sha256=9_M-VkHN1_T1XN-gsl_L636QMQ9BLF2WqvTcx_1L6aw,12432
  sglang/srt/models/qwen2_moe.py,sha256=s7b5XnSvsBYtZZUkjPp442m59CqPJ3HxGUIwXBVWsXw,17153
  sglang/srt/models/stablelm.py,sha256=30ngpc0Xq3VxzXJlf6svP1oax8Q3krMJkxM8PVKtZWU,11359
- sglang/srt/models/xverse.py,sha256=luhp_90ZNkTpXHDCURO4MZBy1vbvHTVCwSe4PYYLWBs,13701
- sglang/srt/models/xverse_moe.py,sha256=YR--WZ33G7XEMsS7ZJl1cQ62Q8PDo9gWqpvJBY_cb-M,15886
+ sglang/srt/models/xverse.py,sha256=L3g32-je_7JmzF2-hztaIVshHYCIv7jOM3oFs-fb2MY,13658
+ sglang/srt/models/xverse_moe.py,sha256=CgDD9cR83UVfTsPU6WcbHVYBrkYKv_kTdwncTIx7Q7U,15842
  sglang/srt/models/yivl.py,sha256=B6MELthWIm5KdSzX3o2tbbpApY8XdjUdmcQSD4dQe_I,4835
  sglang/srt/openai_api/adapter.py,sha256=CJ47YftRHAip1FMcHIhtCorBtzlIkv7F0Wz_JUcI4T4,51032
  sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
@@ -109,7 +110,7 @@ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq
  sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
  sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
  sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
- sglang/test/few_shot_gsm8k.py,sha256=uSHEPvUFbAgWKtaqxkhBpQrQV_SlTk0HN9FhjNLpL4g,3731
+ sglang/test/few_shot_gsm8k.py,sha256=To7Sdg-DLF8poIQLwiOBYKbkz-1C_gn6H79vIbyPR-o,3860
  sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
  sglang/test/runners.py,sha256=ZoWhT1TDXfLBVdbivXx1KUu9dhPlGjL_xrP18WLzVLo,11404
  sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
@@ -121,10 +122,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
  sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
  sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
  sglang/test/test_programs.py,sha256=3-XKnppQdCNWjaJb6jwib5Z9OSpgKvH8SFLJbE4J9qI,17001
- sglang/test/test_utils.py,sha256=NLiJqFRWnCeQ-gdCBe0ubNFCsig1CPb1EU-Ay9CtSfU,17109
+ sglang/test/test_utils.py,sha256=dsHRd1xLzcjlarxUnDIz2XEHfut7HvqVPwx2Fn7vf10,17179
  sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
- sglang-0.3.1.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- sglang-0.3.1.post1.dist-info/METADATA,sha256=zswdq5UTi5aLVmpEyjnc7SzIi60yc4w2hlMhckdxmcU,38137
- sglang-0.3.1.post1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
- sglang-0.3.1.post1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
- sglang-0.3.1.post1.dist-info/RECORD,,
+ sglang-0.3.1.post2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sglang-0.3.1.post2.dist-info/METADATA,sha256=WxMy8Ur_rjPxqVOoWSFoM3eBHWt0cKGyrtwOUfWL-Vc,38114
+ sglang-0.3.1.post2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+ sglang-0.3.1.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.3.1.post2.dist-info/RECORD,,