sglang 0.3.5.post1__py3-none-any.whl → 0.3.5.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sglang/bench_offline_throughput.py ADDED
@@ -0,0 +1,309 @@
+ """
+ Benchmark the throughput of using the offline LLM engine.
+ This script does not launch a server.
+ It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).
+
+ # Usage
+ ## Sharegpt dataset with default args
+ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
+
+ ## Random dataset with default args
+ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random
+
+ ## Shared prefix dataset with default args
+ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name generated-shared-prefix
+
+ ## Sharegpt dataset on runtime backend
+ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --backend runtime
+ """
+
+ import argparse
+ import dataclasses
+ import json
+ import logging
+ import random
+ import time
+ from typing import List, Optional, Tuple
+
+ import numpy as np
+
+ from sglang.api import Engine
+ from sglang.bench_serving import (
+     get_dataset,
+     get_tokenizer,
+     sample_random_requests,
+     set_ulimit,
+ )
+ from sglang.srt.server import Runtime
+ from sglang.srt.server_args import ServerArgs
+
+
+ @dataclasses.dataclass
+ class BenchArgs:
+     backend: str = "engine"
+     result_filename: str = ""
+     dataset_name: str = "sharegpt"
+     dataset_path: str = ""
+     num_prompts: int = 1000
+     sharegpt_output_len: Optional[int] = None
+     random_input_len: int = 1024
+     random_output_len: int = 1024
+     random_range_ratio: float = 0.0
+     gen_num_groups: int = 64
+     gen_prompts_per_group: int = 16
+     gen_system_prompt_len: int = 2048
+     gen_question_len: int = 128
+     gen_output_len: int = 256
+     disable_ignore_eos: bool = False
+     seed: int = 1
+
+     @staticmethod
+     def add_cli_args(parser: argparse.ArgumentParser):
+         parser.add_argument("--backend", type=str, default=BenchArgs.backend)
+         parser.add_argument(
+             "--result-filename", type=str, default=BenchArgs.result_filename
+         )
+         parser.add_argument(
+             "--dataset-name",
+             type=str,
+             default="sharegpt",
+             choices=["sharegpt", "random", "generated-shared-prefix"],
+             help="Name of the dataset to benchmark on.",
+         )
+         parser.add_argument(
+             "--dataset-path", type=str, default="", help="Path to the dataset."
+         )
+         parser.add_argument(
+             "--num-prompts",
+             type=int,
+             default=BenchArgs.num_prompts,
+             help="Number of prompts to process. Default is 1000.",
+         )
+         parser.add_argument(
+             "--sharegpt-output-len",
+             type=int,
+             default=BenchArgs.sharegpt_output_len,
+             help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
+         )
+         parser.add_argument(
+             "--random-input-len",
+             type=int,
+             default=BenchArgs.random_input_len,
+             help="Number of input tokens per request, used only for random dataset.",
+         )
+         parser.add_argument(
+             "--random-output-len",
+             type=int,
+             default=BenchArgs.random_output_len,
+             help="Number of output tokens per request, used only for random dataset.",
+         )
+         parser.add_argument(
+             "--random-range-ratio",
+             type=float,
+             default=BenchArgs.random_range_ratio,
+             help="Range of sampled ratio of input/output length, "
+             "used only for random dataset.",
+         )
+         parser.add_argument(
+             "--gen-num-groups",
+             type=int,
+             default=BenchArgs.gen_num_groups,
+             help="Number of groups with shared prefix, used "
+             "only for generated-shared-prefix",
+         )
+         parser.add_argument(
+             "--gen-prompts-per-group",
+             type=int,
+             default=BenchArgs.gen_prompts_per_group,
+             help="Number of prompts per group of shared prefix, used "
+             "only for generated-shared-prefix",
+         )
+         parser.add_argument(
+             "--gen-system-prompt-len",
+             type=int,
+             default=BenchArgs.gen_system_prompt_len,
+             help="System prompt length, used only for generated-shared-prefix",
+         )
+         parser.add_argument(
+             "--gen-question-len",
+             type=int,
+             default=BenchArgs.gen_question_len,
+             help="Question length, used only for generated-shared-prefix",
+         )
+         parser.add_argument(
+             "--gen-output-len",
+             type=int,
+             default=BenchArgs.gen_output_len,
+             help="Target length in tokens for outputs in generated-shared-prefix dataset",
+         )
+         parser.add_argument(
+             "--disable-ignore-eos",
+             type=bool,
+             default=BenchArgs.disable_ignore_eos,
+             help="Disable ignoring the EOS token",
+         )
+         parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+
+     @classmethod
+     def from_cli_args(cls, args: argparse.Namespace):
+         attrs = [attr.name for attr in dataclasses.fields(cls)]
+         return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+
+ def throughput_test_once(
+     backend_name: str,
+     backend,
+     reqs: List[Tuple[str, int, int]],
+     ignore_eos: bool,
+ ):
+     measurement_results = {
+         "backend": backend_name,
+         "successful_requests": len(reqs),
+         "total_latency": -1,
+         "total_input_tokens": sum(r[1] for r in reqs),
+         "total_output_tokens": -1,
+         "request_throughput": -1,
+         "input_throughput": -1,
+         "output_throughput": -1,
+         "total_throughput": -1,
+     }
+
+     prompt = [r[0] for r in reqs]
+     sampling_params = [
+         {
+             "temperature": 0,
+             "max_new_tokens": r[2],
+             "ignore_eos": ignore_eos,
+         }
+         for r in reqs
+     ]
+
+     st = time.perf_counter()
+     gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+     latency = time.perf_counter() - st
+
+     if backend_name == "runtime":
+         gen_out = json.loads(gen_out)
+
+     measurement_results["total_latency"] = latency
+     measurement_results["total_output_tokens"] = sum(
+         o["meta_info"]["completion_tokens"] for o in gen_out
+     )
+     measurement_results["request_throughput"] = (
+         measurement_results["successful_requests"] / latency
+     )
+     measurement_results["input_throughput"] = (
+         measurement_results["total_input_tokens"] / latency
+     )
+     measurement_results["output_throughput"] = (
+         measurement_results["total_output_tokens"] / latency
+     )
+     measurement_results["total_throughput"] = (
+         measurement_results["total_input_tokens"]
+         + measurement_results["total_output_tokens"]
+     ) / latency
+
+     return measurement_results
+
+
+ def throughput_test(
+     server_args: ServerArgs,
+     bench_args: BenchArgs,
+ ):
+     if bench_args.backend == "engine":
+         backend = Engine(**dataclasses.asdict(server_args))
+         if not backend:
+             raise ValueError("Please provide valid engine arguments")
+     elif bench_args.backend == "runtime":
+         backend = Runtime(**dataclasses.asdict(server_args))
+     else:
+         raise ValueError('Please set backend to either "engine" or "runtime"')
+
+     tokenizer_id = server_args.model_path
+     tokenizer = get_tokenizer(tokenizer_id)
+
+     # Set global environments
+     set_ulimit()
+     random.seed(bench_args.seed)
+     np.random.seed(bench_args.seed)
+
+     # Read dataset
+     input_requests = get_dataset(bench_args, tokenizer)
+
+     warmup_requests = sample_random_requests(
+         input_len=20,
+         output_len=4,
+         num_prompts=2,
+         range_ratio=0.8,
+         tokenizer=tokenizer,
+         dataset_path=bench_args.dataset_path,
+     )
+
+     # Warm up
+     throughput_test_once(
+         backend_name=bench_args.backend,
+         backend=backend,
+         reqs=warmup_requests,
+         ignore_eos=not bench_args.disable_ignore_eos,
+     )
+
+     result = throughput_test_once(
+         backend_name=bench_args.backend,
+         backend=backend,
+         reqs=input_requests,
+         ignore_eos=not bench_args.disable_ignore_eos,
+     )
+
+     if bench_args.result_filename:
+         with open(bench_args.result_filename, "a") as fout:
+             fout.write(json.dumps(result) + "\n")
+
+     print(
+         "\n{s:{c}^{n}}".format(s=" Offline Throughput Benchmark Result ", n=50, c="=")
+     )
+     print("{:<40} {:<10}".format("Backend:", result["backend"]))
+     print("{:<40} {:<10}".format("Successful requests:", result["successful_requests"]))
+     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", result["total_latency"]))
+     print("{:<40} {:<10}".format("Total input tokens:", result["total_input_tokens"]))
+     print(
+         "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"])
+     )
+     print(
+         "{:<40} {:<10.2f}".format(
+             "Request throughput (req/s):", result["request_throughput"]
+         )
+     )
+     print(
+         "{:<40} {:<10.2f}".format(
+             "Input token throughput (tok/s):", result["input_throughput"]
+         )
+     )
+     print(
+         "{:<40} {:<10.2f}".format(
+             "Output token throughput (tok/s):", result["output_throughput"]
+         )
+     )
+     print(
+         "{:<40} {:<10.2f}".format(
+             "Total token throughput (tok/s):", result["total_throughput"]
+         )
+     )
+     print("=" * 50)
+
+     return result
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     ServerArgs.add_cli_args(parser)
+     BenchArgs.add_cli_args(parser)
+     args = parser.parse_args()
+     server_args = ServerArgs.from_cli_args(args)
+     bench_args = BenchArgs.from_cli_args(args)
+
+     logging.basicConfig(
+         level=getattr(logging, server_args.log_level.upper()),
+         format="%(message)s",
+     )
+
+     throughput_test(server_args, bench_args)
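
For illustration only (not part of the released diff): the new module can also be driven from Python instead of the CLI. A minimal sketch, assuming the names shown above stay importable and that unspecified ServerArgs/BenchArgs fields keep their defaults; the model path is a placeholder:

from sglang.bench_offline_throughput import BenchArgs, throughput_test
from sglang.srt.server_args import ServerArgs

# Placeholder model; anything accepted by --model-path works here.
server_args = ServerArgs(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
bench_args = BenchArgs(backend="engine", dataset_name="random", num_prompts=100)

# throughput_test() prints the summary table and returns the measurement dict.
result = throughput_test(server_args, bench_args)
print(result["output_throughput"])
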
sglang/bench_serving.py CHANGED
@@ -421,6 +421,37 @@ def get_tokenizer(
      )
 
 
+ def get_dataset(args, tokenizer):
+     if args.dataset_name == "sharegpt":
+         input_requests = sample_sharegpt_requests(
+             dataset_path=args.dataset_path,
+             num_requests=args.num_prompts,
+             tokenizer=tokenizer,
+             fixed_output_len=args.sharegpt_output_len,
+         )
+     elif args.dataset_name == "random":
+         input_requests = sample_random_requests(
+             input_len=args.random_input_len,
+             output_len=args.random_output_len,
+             num_prompts=args.num_prompts,
+             range_ratio=args.random_range_ratio,
+             tokenizer=tokenizer,
+             dataset_path=args.dataset_path,
+         )
+     elif args.dataset_name == "generated-shared-prefix":
+         input_requests = sample_generated_shared_prefix_requests(
+             num_groups=args.gen_num_groups,
+             prompts_per_group=args.gen_prompts_per_group,
+             system_prompt_len=args.gen_system_prompt_len,
+             question_len=args.gen_question_len,
+             output_len=args.gen_output_len,
+             tokenizer=tokenizer,
+         )
+     else:
+         raise ValueError(f"Unknown dataset: {args.dataset_name}")
+     return input_requests
+
+
  ASYNC_REQUEST_FUNCS = {
      "sglang": async_request_sglang_generate,
      "sglang-native": async_request_sglang_generate,
@@ -443,6 +474,8 @@ class BenchmarkMetrics:
      input_throughput: float
      output_throughput: float
      output_throughput_retokenized: float
+     total_throughput: float
+     total_throughput_retokenized: float
      mean_ttft_ms: float
      median_ttft_ms: float
      std_ttft_ms: float
@@ -590,7 +623,6 @@ def sample_random_requests(
          (data["conversations"][0]["value"], data["conversations"][1]["value"])
          for data in dataset
      ]
-
      # Shuffle the dataset.
      random.shuffle(dataset)
 
@@ -764,6 +796,9 @@ def calculate_metrics(
          input_throughput=total_input / dur_s,
          output_throughput=sum(output_lens) / dur_s,
          output_throughput_retokenized=sum(retokenized_output_lens) / dur_s,
+         total_throughput=(total_input + sum(output_lens)) / dur_s,
+         total_throughput_retokenized=(total_input + sum(retokenized_output_lens))
+         / dur_s,
          mean_ttft_ms=np.mean(ttfts or 0)
          * 1000,  # ttfts is empty if streaming is not supported by backend
          median_ttft_ms=np.median(ttfts or 0) * 1000,
@@ -881,6 +916,11 @@ async def benchmark(
              "Output token throughput (tok/s):", metrics.output_throughput
          )
      )
+     print(
+         "{:<40} {:<10.2f}".format(
+             "Total token throughput (tok/s):", metrics.total_throughput
+         )
+     )
      print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
      print(
          "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
@@ -1098,35 +1138,7 @@ def run_benchmark(args_: argparse.Namespace):
 
      tokenizer = get_tokenizer(tokenizer_id)
 
-     if args.dataset_name == "sharegpt":
-         assert args.random_input_len is None and args.random_output_len is None
-         input_requests = sample_sharegpt_requests(
-             dataset_path=args.dataset_path,
-             num_requests=args.num_prompts,
-             tokenizer=tokenizer,
-             fixed_output_len=args.sharegpt_output_len,
-         )
-     elif args.dataset_name == "random":
-         assert args.random_input_len is not None and args.random_output_len is not None
-         input_requests = sample_random_requests(
-             input_len=args.random_input_len,
-             output_len=args.random_output_len,
-             num_prompts=args.num_prompts,
-             range_ratio=args.random_range_ratio,
-             tokenizer=tokenizer,
-             dataset_path=args.dataset_path,
-         )
-     elif args.dataset_name == "generated-shared-prefix":
-         input_requests = sample_generated_shared_prefix_requests(
-             num_groups=args.gen_num_groups,
-             prompts_per_group=args.gen_prompts_per_group,
-             system_prompt_len=args.gen_system_prompt_len,
-             question_len=args.gen_question_len,
-             output_len=args.gen_output_len,
-             tokenizer=tokenizer,
-         )
-     else:
-         raise ValueError(f"Unknown dataset: {args.dataset_name}")
+     input_requests = get_dataset(args, tokenizer)
 
      if not args.multi:
          return asyncio.run(
@@ -1229,10 +1241,12 @@ if __name__ == "__main__":
      parser.add_argument(
          "--random-input-len",
          type=int,
+         default=1024,
          help="Number of input tokens per request, used only for random dataset.",
      )
      parser.add_argument(
          "--random-output-len",
+         default=1024,
          type=int,
          help="Number of output tokens per request, used only for random dataset.",
      )
sglang/srt/constrained/base_grammar_backend.py CHANGED
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """
 
- """The baseclass of backends for grammar-guided constrained decoding."""
+ """The baseclass of a backend for grammar-guided constrained decoding."""
 
  from concurrent.futures import Future, ThreadPoolExecutor
  from dataclasses import dataclass
@@ -52,7 +52,7 @@ class BaseGrammarBackend:
          else:
              entry.value = self.init_value_impl(key)
              entry.event.set()
-         return entry.value.copy()
+         return entry.value.copy() if entry.value else None
 
      def init_value_impl(self, key: Tuple[str, str]) -> BaseGrammarObject:
          raise NotImplementedError()
@@ -62,7 +62,8 @@ class BaseGrammarBackend:
          entry = self.cache.get(key)
          if not entry or not entry.event.is_set():
              return None
-         return self.cache[key].value.copy()
+         val = self.cache[key].value
+         return val.copy() if val else None
 
      def get_future_value(self, key: Tuple[str, str]) -> Future:
          return self.executor.submit(self.init_value, key)
sglang/srt/constrained/outlines_backend.py CHANGED
@@ -19,9 +19,12 @@ import json
  import logging
  from typing import Dict, List, Optional, Tuple, Union
 
+ import interegular
  import torch
  from outlines.fsm.guide import RegexGuide
+ from outlines.fsm.json_schema import build_regex_from_schema
  from outlines.models.transformers import TransformerTokenizer
+ from pydantic import BaseModel
 
  from sglang.srt.constrained.base_grammar_backend import (
      BaseGrammarBackend,
@@ -32,26 +35,6 @@ from sglang.srt.constrained.outlines_jump_forward import OutlinesJumpForwardMap
  logger = logging.getLogger(__name__)
 
 
- try:
-     from outlines.fsm.json_schema import build_regex_from_object
- except ImportError:
-     # Since outlines 0.0.32, build_regex_from_object is replaced by build_regex_from_schema,
-     # which only accepts string schema as input.
-     from outlines.fsm.json_schema import build_regex_from_schema
-     from pydantic import BaseModel
-
-     def build_regex_from_object(
-         object: Union[str, BaseModel, Dict], whitespace_pattern: Optional[str] = None
-     ):
-         if isinstance(object, type(BaseModel)):
-             schema = json.dumps(object.model_json_schema())
-         elif isinstance(object, Dict):
-             schema = json.dumps(object)
-         else:
-             schema = object
-         return build_regex_from_schema(schema, whitespace_pattern)
-
-
  class OutlinesGrammar(BaseGrammarObject):
      def __init__(
          self,
@@ -147,19 +130,36 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
                      key_string,
                      whitespace_pattern=self.whitespace_pattern,
                  )
-             except NotImplementedError as e:
+             except (NotImplementedError, json.decoder.JSONDecodeError) as e:
                  logger.warning(
-                     f"skip invalid json schema: json_schema={key_string}, {e=}"
+                     f"Skip invalid json_schema: json_schema={key_string}, {e=}"
                  )
-                 return None, key_string
+                 return None
          elif key_type == "regex":
              regex = key_string
          else:
              raise ValueError(f"Invalid key_type: {key_type}")
 
-         guide = RegexGuide(regex, self.outlines_tokenizer)
+         try:
+             guide = RegexGuide(regex, self.outlines_tokenizer)
+         except interegular.patterns.InvalidSyntax as e:
+             logger.warning(f"skip invalid regex schema: {regex=}, {e=}")
+             return None
+
          if self.allow_jump_forward:
              jump_forward_map = OutlinesJumpForwardMap(regex)
          else:
             jump_forward_map = None
         return OutlinesGrammar(guide, jump_forward_map)
+
+
+ def build_regex_from_object(
+     object: Union[str, BaseModel, Dict], whitespace_pattern: Optional[str] = None
+ ):
+     if isinstance(object, type(BaseModel)):
+         schema = json.dumps(object.model_json_schema())
+     elif isinstance(object, Dict):
+         schema = json.dumps(object)
+     else:
+         schema = object
+     return build_regex_from_schema(schema, whitespace_pattern)
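
For illustration only (not part of the diff): the relocated build_regex_from_object helper accepts a pydantic model class, a dict, or a raw JSON-schema string, and forwards a serialized schema to outlines' build_regex_from_schema. A minimal sketch, assuming the helper stays importable from sglang.srt.constrained.outlines_backend and pydantic v2 is installed; the Answer model is hypothetical:

from pydantic import BaseModel

from sglang.srt.constrained.outlines_backend import build_regex_from_object

class Answer(BaseModel):  # hypothetical schema, used only for this example
    name: str
    score: int

# All three input forms should yield an equivalent regex for constrained decoding.
regex_from_model = build_regex_from_object(Answer)
regex_from_dict = build_regex_from_object(Answer.model_json_schema())
regex_from_str = build_regex_from_object('{"type": "object", "properties": {"name": {"type": "string"}}}')
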
sglang/srt/constrained/xgrammar_backend.py CHANGED
@@ -15,16 +15,29 @@ limitations under the License.
 
  """Constrained decoding with xgrammar backend."""
 
+ import logging
  from typing import List, Tuple
 
  import torch
- from xgrammar import CachedGrammarCompiler, CompiledGrammar, GrammarMatcher
+
+ try:
+     from xgrammar import CachedGrammarCompiler, CompiledGrammar, GrammarMatcher
+
+     import_error = None
+ except ImportError as e:
+     CachedGrammarCompiler = CompiledGrammar = GrammarMatcher = TokenizerInfo = (
+         ImportError
+     )
+     import_error = e
 
  from sglang.srt.constrained.base_grammar_backend import (
      BaseGrammarBackend,
      BaseGrammarObject,
  )
 
+ logger = logging.getLogger(__name__)
+
+
  MAX_ROLLBACK_TOKENS = 10
 
 
@@ -91,15 +104,37 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
          vocab_size: int,
      ):
          super().__init__()
+
+         if import_error:
+             logger.warning(
+                 f"Ignore import error for the grammar backend: {import_error}"
+             )
+             self.grammar_cache = None
+             return
+
          self.grammar_cache = CachedGrammarCompiler(tokenizer_or_vocab=tokenizer)
          self.vocab_size = vocab_size
 
      def init_value_impl(self, key: Tuple[str, str]) -> XGrammarGrammar:
+         if import_error:
+             raise import_error
+
          key_type, key_string = key
          if key_type == "json":
-             ctx = self.grammar_cache.get_compiled_grammar_for_json_schema(key_string)
+             try:
+                 ctx = self.grammar_cache.get_compiled_grammar_for_json_schema(
+                     key_string
+                 )
+             except RuntimeError as e:
+                 logging.warning(
+                     f"Skip invalid json_schema: json_schema={key_string}, {e=}"
+                 )
+                 return None
          elif key_type == "regex":
-             raise ValueError("regex hasn't been supported by xgrammar yet")
+             logger.warning(
+                 "regex hasn't been supported by xgrammar yet. This is skipped."
+             )
+             return None
          else:
              raise ValueError(f"Invalid key_type: {key_type}")
 
@@ -111,4 +146,5 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
          return XGrammarGrammar(matcher, self.vocab_size, ctx)
 
      def reset(self):
-         self.grammar_cache.clear()
+         if self.grammar_cache:
+             self.grammar_cache.clear()
sglang/srt/layers/fused_moe/patch.py CHANGED
@@ -1,4 +1,4 @@
- from typing import Optional
+ from typing import Callable, Optional
 
  import torch
  from torch.nn import functional as F
@@ -98,7 +98,9 @@ def fused_moe_forward_native(
      renormalize: bool,
      topk_group: Optional[int] = None,
      num_expert_group: Optional[int] = None,
+     custom_routing_function: Optional[Callable] = None,
  ) -> torch.Tensor:
+     assert custom_routing_function is None
      topk_weights, topk_ids = select_experts_native(
          hidden_states=x,
          router_logits=router_logits,
@@ -114,4 +116,4 @@
      x1 = F.silu(torch.einsum("ti,taoi -> tao", x, w1_weights))
      x3 = torch.einsum("ti, taoi -> tao", x, w3_weights)
      expert_outs = torch.einsum("tao, taio -> tai", (x1 * x3), w2_weights)
-     return torch.einsum("tai,ta -> ti", expert_outs, topk_weights)
+     return torch.einsum("tai,ta -> ti", expert_outs, topk_weights.to(expert_outs.dtype))
sglang/srt/managers/detokenizer_manager.py CHANGED
@@ -100,20 +100,6 @@ class DetokenizerManager:
 
              if isinstance(recv_obj, BatchEmbeddingOut):
                  # If it is embedding model, no detokenization is needed.
-                 self.send_to_tokenizer.send_pyobj(
-                     BatchEmbeddingOut(
-                         rids=recv_obj.rids,
-                         embeddings=recv_obj.embeddings,
-                         meta_info=recv_obj.meta_info,
-                         finished_reason=recv_obj.finished_reason,
-                     )
-                 )
-                 continue
-             elif isinstance(recv_obj, UpdateWeightReqOutput):
-                 # If it is a weight update request, no detokenization is needed.
-                 self.send_to_tokenizer.send_pyobj(recv_obj)
-                 continue
-             elif isinstance(recv_obj, GetMemPoolSizeReqOutput):
                  self.send_to_tokenizer.send_pyobj(recv_obj)
                  continue
              else:
sglang/srt/managers/scheduler.py CHANGED
@@ -114,6 +114,9 @@ class Scheduler:
              self.recv_from_tokenizer = get_zmq_socket(
                  context, zmq.PULL, port_args.scheduler_input_ipc_name
              )
+             self.send_to_tokenizer = get_zmq_socket(
+                 context, zmq.PUSH, port_args.tokenizer_ipc_name
+             )
 
              if server_args.skip_tokenizer_init:
                  # Directly send to the tokenizer/api
@@ -127,6 +130,7 @@ class Scheduler:
              )
          else:
              self.recv_from_tokenizer = None
+             self.send_to_tokenizer = SimpleNamespace(send_pyobj=lambda x: None)
              self.send_to_detokenizer = SimpleNamespace(send_pyobj=lambda x: None)
 
          # Init tokenizer
@@ -421,7 +425,7 @@ class Scheduler:
                  self.abort_request(recv_req)
              elif isinstance(recv_req, UpdateWeightReqInput):
                  success, message = self.update_weights(recv_req)
-                 self.send_to_detokenizer.send_pyobj(
+                 self.send_to_tokenizer.send_pyobj(
                      UpdateWeightReqOutput(success, message)
                  )
              elif isinstance(recv_req, ProfileReq):
@@ -430,7 +434,7 @@ class Scheduler:
                  else:
                      self.stop_profile()
              elif isinstance(recv_req, GetMemPoolSizeReq):
-                 self.send_to_detokenizer.send_pyobj(
+                 self.send_to_tokenizer.send_pyobj(
                      GetMemPoolSizeReqOutput(self.max_total_num_tokens)
                  )
              else:
sglang/srt/model_executor/model_runner.py CHANGED
@@ -233,7 +233,10 @@ class ModelRunner:
 
          # Prepare the vllm model config
          monkey_patch_vllm_dummy_weight_loader()
-         self.load_config = LoadConfig(load_format=self.server_args.load_format)
+         self.load_config = LoadConfig(
+             load_format=self.server_args.load_format,
+             download_dir=self.server_args.download_dir,
+         )
          self.vllm_model_config = VllmModelConfig(
              model=self.server_args.model_path,
              quantization=self.server_args.quantization,
sglang/srt/openai_api/adapter.py CHANGED
@@ -516,8 +516,9 @@ def v1_generate_request(
                  "regex": request.regex,
                  "json_schema": request.json_schema,
                  "n": request.n,
-                 "ignore_eos": request.ignore_eos,
                  "no_stop_trim": request.no_stop_trim,
+                 "ignore_eos": request.ignore_eos,
+                 "skip_special_tokens": request.skip_special_tokens,
              }
          )
          return_logprobs.append(request.logprobs is not None and request.logprobs > 0)
@@ -928,7 +929,9 @@ def v1_chat_generate_request(
              "repetition_penalty": request.repetition_penalty,
              "regex": request.regex,
              "n": request.n,
+             "no_stop_trim": request.no_stop_trim,
              "ignore_eos": request.ignore_eos,
+             "skip_special_tokens": request.skip_special_tokens,
          }
          if request.response_format and request.response_format.type == "json_schema":
              sampling_params["json_schema"] = convert_json_schema_to_str(
@@ -1166,7 +1169,7 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
                      is_first = False
                      choice_data = ChatCompletionResponseStreamChoice(
                          index=index,
-                         delta=DeltaMessage(role="assistant"),
+                         delta=DeltaMessage(role="assistant", content=""),
                          finish_reason=(
                              finish_reason["type"] if finish_reason else ""
                          ),
sglang/srt/openai_api/protocol.py CHANGED
@@ -36,7 +36,7 @@ class ModelList(BaseModel):
      """Model list consists of model cards."""
 
      object: str = "list"
-     data: List[ModelCard] = []
+     data: List[ModelCard] = Field(default_factory=list)
 
 
  class ErrorResponse(BaseModel):
@@ -143,7 +143,7 @@ class BatchResponse(BaseModel):
      expired_at: Optional[int] = None
      cancelling_at: Optional[int] = None
      cancelled_at: Optional[int] = None
-     request_counts: dict = {"total": 0, "completed": 0, "failed": 0}
+     request_counts: Optional[dict] = None
      metadata: Optional[dict] = None
 
 
@@ -153,30 +153,31 @@ class CompletionRequest(BaseModel):
      model: str
      prompt: Union[List[int], List[List[int]], str, List[str]]
      best_of: Optional[int] = None
-     echo: Optional[bool] = False
-     frequency_penalty: Optional[float] = 0.0
+     echo: bool = False
+     frequency_penalty: float = 0.0
      logit_bias: Optional[Dict[str, float]] = None
      logprobs: Optional[int] = None
-     max_tokens: Optional[int] = 16
+     max_tokens: int = 16
      n: int = 1
-     presence_penalty: Optional[float] = 0.0
+     presence_penalty: float = 0.0
      seed: Optional[int] = None
-     stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
-     stream: Optional[bool] = False
+     stop: Optional[Union[str, List[str]]] = None
+     stream: bool = False
      stream_options: Optional[StreamOptions] = None
      suffix: Optional[str] = None
-     temperature: Optional[float] = 1.0
-     top_p: Optional[float] = 1.0
+     temperature: float = 1.0
+     top_p: float = 1.0
      user: Optional[str] = None
 
      # Extra parameters for SRT backend only and will be ignored by OpenAI models.
-     regex: Optional[str] = None
      json_schema: Optional[str] = None
-     ignore_eos: bool = False
+     regex: Optional[str] = None
      min_tokens: int = 0
-     repetition_penalty: Optional[float] = 1.0
-     stop_token_ids: Optional[List[int]] = Field(default_factory=list)
-     no_stop_trim: Union[bool, List[bool]] = False
+     repetition_penalty: float = 1.0
+     stop_token_ids: Optional[List[int]] = None
+     no_stop_trim: bool = False
+     ignore_eos: bool = False
+     skip_special_tokens: bool = True
 
 
  class CompletionResponseChoice(BaseModel):
@@ -259,28 +260,30 @@ class ChatCompletionRequest(BaseModel):
      # https://platform.openai.com/docs/api-reference/chat/create
      messages: List[ChatCompletionMessageParam]
      model: str
-     frequency_penalty: Optional[float] = 0.0
+     frequency_penalty: float = 0.0
      logit_bias: Optional[Dict[str, float]] = None
-     logprobs: Optional[bool] = False
+     logprobs: bool = False
      top_logprobs: Optional[int] = None
      max_tokens: Optional[int] = None
-     n: Optional[int] = 1
-     presence_penalty: Optional[float] = 0.0
+     n: int = 1
+     presence_penalty: float = 0.0
      response_format: Optional[ResponseFormat] = None
      seed: Optional[int] = None
-     stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
-     stream: Optional[bool] = False
+     stop: Optional[Union[str, List[str]]] = None
+     stream: bool = False
      stream_options: Optional[StreamOptions] = None
-     temperature: Optional[float] = 0.7
-     top_p: Optional[float] = 1.0
+     temperature: float = 0.7
+     top_p: float = 1.0
      user: Optional[str] = None
 
      # Extra parameters for SRT backend only and will be ignored by OpenAI models.
      regex: Optional[str] = None
-     min_tokens: Optional[int] = 0
-     repetition_penalty: Optional[float] = 1.0
-     stop_token_ids: Optional[List[int]] = Field(default_factory=list)
+     min_tokens: int = 0
+     repetition_penalty: float = 1.0
+     stop_token_ids: Optional[List[int]] = None
+     no_stop_trim: bool = False
      ignore_eos: bool = False
+     skip_special_tokens: bool = True
 
 
  class ChatMessage(BaseModel):
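
For illustration only (not part of the diff): the new skip_special_tokens and no_stop_trim request fields live in the SRT-only extra-parameter section, so an OpenAI-compatible client can pass them through extra_body. A rough sketch, assuming an sglang server on the default local port and the openai>=1.0 Python client:

import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
response = client.completions.create(
    model="default",
    prompt="List three prime numbers:",
    max_tokens=32,
    extra_body={"skip_special_tokens": False, "no_stop_trim": True},  # fields added in this release
)
print(response.choices[0].text)
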
sglang/srt/sampling/sampling_params.py CHANGED
@@ -34,13 +34,13 @@ class SamplingParams:
          frequency_penalty: float = 0.0,
          presence_penalty: float = 0.0,
          repetition_penalty: float = 1.0,
-         ignore_eos: bool = False,
-         skip_special_tokens: bool = True,
          spaces_between_special_tokens: bool = True,
          regex: Optional[str] = None,
          n: int = 1,
          json_schema: Optional[str] = None,
          no_stop_trim: bool = False,
+         ignore_eos: bool = False,
+         skip_special_tokens: bool = True,
      ) -> None:
          self.temperature = temperature
          self.top_p = top_p
sglang/srt/server.py CHANGED
@@ -139,6 +139,7 @@ async def get_model_info():
      """Get the model information."""
      result = {
          "model_path": tokenizer_manager.model_path,
+         "tokenizer_path": tokenizer_manager.server_args.tokenizer_path,
          "is_generation": tokenizer_manager.is_generation,
      }
      return result
@@ -768,7 +769,7 @@ class Engine:
          self,
          # The input prompt. It can be a single prompt or a batch of prompts.
          prompt: Optional[Union[List[str], str]] = None,
-         sampling_params: Optional[Dict] = None,
+         sampling_params: Optional[Union[List[Dict], Dict]] = None,
          # The token ids for text; one can either specify text or input_ids.
          input_ids: Optional[Union[List[List[int]], List[int]]] = None,
          return_logprob: Optional[Union[List[bool], bool]] = False,
sglang/srt/server_args.py CHANGED
@@ -22,7 +22,12 @@ import random
  import tempfile
  from typing import List, Optional
 
- from sglang.srt.utils import is_flashinfer_available, is_ipv6, is_port_available
+ from sglang.srt.utils import (
+     get_gpu_memory_capacity,
+     is_flashinfer_available,
+     is_ipv6,
+     is_port_available,
+ )
 
  logger = logging.getLogger(__name__)
 
@@ -64,6 +69,7 @@ class ServerArgs:
      random_seed: Optional[int] = None
      constrained_json_whitespace_pattern: Optional[str] = None
      watchdog_timeout: float = 300
+     download_dir: Optional[str] = None
 
      # Logging
      log_level: str = "info"
@@ -142,6 +148,9 @@ class ServerArgs:
              # Disable chunked prefill
              self.chunked_prefill_size = None
 
+         if self.random_seed is None:
+             self.random_seed = random.randint(0, 1 << 30)
+
          # Mem fraction depends on the tensor parallelism size
          if self.mem_fraction_static is None:
              if self.tp_size >= 16:
@@ -155,8 +164,14 @@ class ServerArgs:
              else:
                  self.mem_fraction_static = 0.88
 
-         if self.random_seed is None:
-             self.random_seed = random.randint(0, 1 << 30)
+         # Adjust for GPUs with small memory capacities
+         gpu_mem = get_gpu_memory_capacity()
+         if gpu_mem < 25000:
+             logger.warning(
+                 "Automatically adjust --chunked-prefill-size for small GPUs."
+             )
+             self.chunked_prefill_size //= 4  # make it 2048
+             self.cuda_graph_max_bs = 4
 
          # Deprecation warnings
          if self.disable_flashinfer:
@@ -405,6 +420,12 @@ class ServerArgs:
              default=ServerArgs.watchdog_timeout,
              help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
          )
+         parser.add_argument(
+             "--download-dir",
+             type=str,
+             default=ServerArgs.download_dir,
+             help="Model download directory.",
+         )
 
          # Logging
          parser.add_argument(
sglang/srt/utils.py CHANGED
@@ -27,6 +27,7 @@ import resource
  import shutil
  import signal
  import socket
+ import subprocess
  import tempfile
  import time
  import warnings
@@ -791,3 +792,35 @@ def add_prometheus_middleware(app):
      # Workaround for 307 Redirect for /metrics
      metrics_route.path_regex = re.compile("^/metrics(?P<path>.*)$")
      app.routes.append(metrics_route)
+
+
+ def get_gpu_memory_capacity():
+     try:
+         # Run nvidia-smi and capture the output
+         result = subprocess.run(
+             ["nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits"],
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+             text=True,
+         )
+
+         if result.returncode != 0:
+             raise RuntimeError(f"nvidia-smi error: {result.stderr.strip()}")
+
+         # Parse the output to extract memory values
+         memory_values = [
+             float(mem)
+             for mem in result.stdout.strip().split("\n")
+             if re.match(r"^\d+(\.\d+)?$", mem.strip())
+         ]
+
+         if not memory_values:
+             raise ValueError("No GPU memory values found.")
+
+         # Return the minimum memory value
+         return min(memory_values)
+
+     except FileNotFoundError:
+         raise RuntimeError(
+             "nvidia-smi not found. Ensure NVIDIA drivers are installed and accessible."
+         )
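
For illustration only (not part of the diff): get_gpu_memory_capacity shells out to nvidia-smi, parses memory.total (reported in MiB), and returns the smallest value across visible GPUs; the server_args.py change shown earlier compares that value against 25000 to decide whether to shrink the chunked prefill size. A minimal sketch:

from sglang.srt.utils import get_gpu_memory_capacity

gpu_mem_mib = get_gpu_memory_capacity()  # minimum memory.total across GPUs, in MiB
if gpu_mem_mib < 25000:
    # Mirrors the small-GPU branch added in server_args.py in this release.
    print("Small GPU detected: chunked prefill size and CUDA graph batch size are reduced.")
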
sglang/test/test_utils.py CHANGED
@@ -28,8 +28,9 @@ from sglang.utils import get_exception_traceback
  DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
  DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
  DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
- DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
  DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+ DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
+ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
  DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
  DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
  DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
@@ -672,7 +673,7 @@ def run_and_check_memory_leak(
      if enable_mixed_chunk:
          other_args += ["--enable-mixed-chunk"]
      if enable_overlap:
-         other_args += ["--enable-overlap-scheduler"]
+         other_args += ["--enable-overlap-schedule"]
 
      model = DEFAULT_MODEL_NAME_FOR_TEST
      port = random.randint(4000, 5000)
@@ -739,8 +740,7 @@ def run_mmlu_test(
 
      try:
          metrics = run_eval(args)
-         print(f"{metrics=}")
-         assert metrics["score"] >= 0.65
+         assert metrics["score"] >= 0.65, f"{metrics=}"
      finally:
          pass
 
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.3.5.post1"
+ __version__ = "0.3.5.post2"
sglang-0.3.5.post2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.3.5.post1
+ Version: 0.3.5.post2
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
                          Version 2.0, January 2004
@@ -264,7 +264,7 @@ Requires-Dist: torchao; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
  Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
- Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
+ Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
  Requires-Dist: modelscope; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
sglang-0.3.5.post2.dist-info/RECORD CHANGED
@@ -1,14 +1,15 @@
  sglang/__init__.py,sha256=b_pqO9bR2fjK9En_tigfzKTiQzE8b_hUizY0DAKVk1M,1616
  sglang/api.py,sha256=3I9YUJNOeCqwKymZec2JR_agjTyKIx4XoT6IGdZ4_Cs,6953
  sglang/bench_latency.py,sha256=SSqZjcCNO88ExpT94qBZ5CmuA5o0T8wMTBnxLsNMqik,18259
+ sglang/bench_offline_throughput.py,sha256=xBr7gI_ZbrpXXD72Nzu1F228oNyz1jggcblZCeUWJgw,9975
  sglang/bench_server_latency.py,sha256=N1MODIzcMk74yOWmY19d36aih3ewtHOemLxoieKtdhw,5866
- sglang/bench_serving.py,sha256=vYlXSXnAeUuF6oCW7r07pkQgnK9UR42B-XHyDu22erM,47620
+ sglang/bench_serving.py,sha256=ytef89P9bqKRaMGXAqq69SmLTlNXWyHyhEraISLKYME,47975
  sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
  sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
  sglang/launch_server.py,sha256=_XIqBcXArYtHTqilOFkYWKZBYXGCMHAxbYOST08LGj0,415
  sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
  sglang/utils.py,sha256=eCvD3fZCALr-MuyZxJL7HAeeqqpxAxf4LJrf7OiCbco,11547
- sglang/version.py,sha256=zPnEkP8KmACe4vaOxE-TiO3Jo-alnSUGAjnKThcNdBg,28
+ sglang/version.py,sha256=NlX-QUNR7ogIH-GcgzllsyHox7ItJoycFEUM_EYuhW4,28
  sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  sglang/lang/chat_template.py,sha256=jprS3-In2FTUoedKwZg-HYvDwU8RTIYntOlf2zoN2sU,14814
  sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -26,18 +27,18 @@ sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bE
  sglang/srt/conversation.py,sha256=erz6wEXMcSmBlskuUhX2c-MT0EMyqyFpTem9PgastEE,21107
  sglang/srt/hf_transformers_utils.py,sha256=QbYVTnz0UdaXESPMAaq1OMzzznn95J_l08eXJuB68aU,6618
  sglang/srt/mm_utils.py,sha256=ml68nWUJhs_FS2FU1oB9UPHKZmF7P2DQHl1ddywn4ao,12272
- sglang/srt/server.py,sha256=mpZmCVNSN_Go-mEKaYYhRNDFJHbmsK8WCc786oSCf5c,28685
- sglang/srt/server_args.py,sha256=9sosvHumMtf5L6jKnFNQ0_MMIg3BkaRCPmnGY2niQps,29472
- sglang/srt/utils.py,sha256=WtUZafw6WjAbjtRn_rTW5i2HgYJ65rrtZGpob3ngeuA,26016
+ sglang/srt/server.py,sha256=JUYAE8MDGYou_HbmuR10QFZfg319fGt9VamskvBkpFo,28776
+ sglang/srt/server_args.py,sha256=V8sx2oY0yphHC_uATwv4UTiLUFnvMQl85o6y5AyaoXM,30086
+ sglang/srt/utils.py,sha256=jGSlxbvI50xEybdupDQNHpsCaF1U_5buADrD149766g,27013
  sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
  sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
  sglang/srt/configs/model_config.py,sha256=mBXeDfFUijQnxd38gVGJ6QxgsiitDklfHvbjYBJFKQY,9470
  sglang/srt/configs/qwen2vl.py,sha256=AYHuFgJ0bwhWYkD7S6fvP7yJejJnuhy4xp5Q2W-O6ps,4424
  sglang/srt/constrained/__init__.py,sha256=LHj0-NxDQ7S_N3Pc1gJ-FmIJVN_PTP9ytitWOICSMHk,691
- sglang/srt/constrained/base_grammar_backend.py,sha256=jRLKExPzMiM6GjryunJNEVrRMmHV-aJ21VhtB9c6bDw,2194
- sglang/srt/constrained/outlines_backend.py,sha256=mrubHYHdalbsgHgeu9Ct5OFUd7RnMok5jLXjdKHv-PE,5857
+ sglang/srt/constrained/base_grammar_backend.py,sha256=OPuBSd_F_fRwjVj6YFWBQuGeikj7UQtkTvc-JgEYt4I,2259
+ sglang/srt/constrained/outlines_backend.py,sha256=J03QQiT9pkdXyoYGw3Rj6taEyWlIr4VCBvxQ3aMiB8A,5786
  sglang/srt/constrained/outlines_jump_forward.py,sha256=1fnYxlrc24xjcW3Wx59Hyg0L9hiHIVgMVUsld3UDfW4,6102
- sglang/srt/constrained/xgrammar_backend.py,sha256=ZvEDDI_huTn2OjOfQQhqfxJU2w4R1tR1v7PwV98A0u4,3640
+ sglang/srt/constrained/xgrammar_backend.py,sha256=wMWqkLN5KhnJXL6GBqbcrhxvAAMx60nG88KIBU1bFSc,4505
  sglang/srt/layers/activation.py,sha256=7VEkCrx2dvl629Lz0fkJcJfVoZA-ykEdkpTzKEc_drQ,5225
  sglang/srt/layers/layernorm.py,sha256=HCj8Y_X6MNNdtQU2sWKgyjIqVERxl9dqrmjbBbyJjpE,3796
  sglang/srt/layers/linear.py,sha256=EOdlpAf6srqxzvPpxcv10KFJKedNc22CGP1qEvpRbDg,46131
@@ -59,19 +60,19 @@ sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=LnuWqGAba03e2
  sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
  sglang/srt/layers/fused_moe/fused_moe.py,sha256=N15tWTm2SGuesJxDIJAdV5FsDUpE-15sb_AIgr4swlw,23656
  sglang/srt/layers/fused_moe/layer.py,sha256=tbHnUJs3uvdDsl3VnwtyGA31VtFouNTPD7h7fPSCYOc,23613
- sglang/srt/layers/fused_moe/patch.py,sha256=B9cDtHqHfnWE0QqZAffvUi6cVRKcMBMKDGJWGIaKh3U,3898
+ sglang/srt/layers/fused_moe/patch.py,sha256=K5CNLnFVxRPd8_jlY4hW6bj7pAACeCFZQA8y5loqqM4,4029
  sglang/srt/layers/quantization/__init__.py,sha256=QilMNqgu3eOFUkEjXLSDa1NvoNdi_CAvC8a1hprOgN8,2979
  sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
  sglang/srt/lora/lora.py,sha256=meRL7oBUx8mxV_isc3Lp0EIsFQWC2PvaN-fE78BmMwg,14970
  sglang/srt/lora/lora_config.py,sha256=paVB7F7SIuxr_vodvKf8zzAlH2fdVYHhXxcXV62D0Vo,1411
  sglang/srt/lora/lora_manager.py,sha256=gzBwYXZEPYj56PkGTshTbWRfl_370wb6uTcRhDaLiF8,12801
  sglang/srt/managers/data_parallel_controller.py,sha256=_XB6Ianc8TiqwLTW-7DH6gGjVYBeBU_6WjjaDk0snIY,5686
- sglang/srt/managers/detokenizer_manager.py,sha256=pBCcK-wKgPk4Ty-vQFSGovEZEE_yKK1f7YVDW8vDcYw,7962
+ sglang/srt/managers/detokenizer_manager.py,sha256=erRgf8RijFrGnYjZawu9an1u2mFPRY3tnxzF9PbKc80,7295
  sglang/srt/managers/image_processor.py,sha256=Pk_dtXzljTkFt7Acsv1RyDzEqvCvjc7BMngxGhtkpDU,13817
  sglang/srt/managers/io_struct.py,sha256=O_oHnikwmOexNqH4HP6bwAI5d_jG_C96JGapkLg8B7c,12289
  sglang/srt/managers/schedule_batch.py,sha256=4BgocYdKFTDCrrBkSXCT75EALBx-3RYnoN3SgtdsHlU,39595
  sglang/srt/managers/schedule_policy.py,sha256=LH0rh1PiI5LK-dSd3dar8_po6FidiBUuj0Xcp_yNQAA,12295
- sglang/srt/managers/scheduler.py,sha256=6vqsrZu2roxzXJpNeFQRbDvERTxqbDmbvrGDp1E7FRA,47926
+ sglang/srt/managers/scheduler.py,sha256=ty1sJ9U6JxifIGF4uzZX6CANMJtbjNWPe2k8aRPS6aI,48133
  sglang/srt/managers/tokenizer_manager.py,sha256=n_XCsCOwLZWCLv1ZJLGjyKgrAWCAQDyEhjnkxOptSa8,24436
  sglang/srt/managers/tp_worker.py,sha256=S5oim5xrkg1j68hYq6LfC8T533JYmQX9Kabt6U8ZXn4,5726
  sglang/srt/managers/tp_worker_overlap_thread.py,sha256=j5J4yHyR7w2HgAbN7S__299ADvsoyap5HK63SWMNavQ,7546
@@ -84,7 +85,7 @@ sglang/srt/metrics/collector.py,sha256=9kidVhr4ldbSntAYfzwJt_2CTUFnnej0OoQdxUUwU
  sglang/srt/metrics/func_timer.py,sha256=xe9UT4bPP1mA4GRZLsCd708cmv1B00hMpUmF7hzAKB4,3344
  sglang/srt/model_executor/cuda_graph_runner.py,sha256=ZMkyfZpWgDXfBpJ4cenh1TxXtt1O2xqeiXhDkq6E5pU,12936
  sglang/srt/model_executor/forward_batch_info.py,sha256=61TVExbiXDQRvZ6oevNz9AIxG7e-KVddgj4I6MTivLg,9426
- sglang/srt/model_executor/model_runner.py,sha256=AYMLc5Rd32ZyWnI6rERPuIASv6D-uA3ztoj9bh0VpcM,26800
+ sglang/srt/model_executor/model_runner.py,sha256=QdFjQRnxZU8r7-MP-NdsnFnPWMRfxa-zTUmKOYmM8HE,26879
  sglang/srt/models/baichuan.py,sha256=RyvPQvi7wy9VUGvLwG17XttcTp43yRj6c3zNRImBToA,15005
  sglang/srt/models/chatglm.py,sha256=9hCXTqGX8DMvSPSn6wlK0YNNRWGS4UiS4-xjFsO9hYU,13135
  sglang/srt/models/commandr.py,sha256=leoQNn4VRqa9SXos6DcrkHVG6-Xp-kjBn2PUgqc9bs8,14051
@@ -123,10 +124,10 @@ sglang/srt/models/torch_native_llama.py,sha256=d8gVNurlVVZ-tD3Uc_aHyGCVUUp1gR8aw
  sglang/srt/models/xverse.py,sha256=meyCCdrZRYNK70hnmydgwhHa1FTBhKekEdpG0_IGTWY,13564
  sglang/srt/models/xverse_moe.py,sha256=xlrhJBAlRzxhp5o0WQU_2V5Uvf8I9fwZLOZBh95o3to,15673
  sglang/srt/models/yivl.py,sha256=xcWqkuZ29FmBBJY6aKetwItWIPl-kfXK-QmgdLONles,4765
- sglang/srt/openai_api/adapter.py,sha256=TFRafrvLvxGx93AZ8OByVwW7Y3ozBdAXg6gX5KU6hK8,53238
- sglang/srt/openai_api/protocol.py,sha256=EZ6G209rBEDP7cepO2kAYqE8wMe1ksYdN7to1iT97Lw,10248
+ sglang/srt/openai_api/adapter.py,sha256=xYBmBLZ_JxfMt_m8LtVe_OB70GV4S9zBOL8e5g_VRvs,53432
+ sglang/srt/openai_api/protocol.py,sha256=Mou5JUMKJkxVxoj4n8R4_sgnYY3OcwniiAi2TEM3hfY,10070
  sglang/srt/sampling/sampling_batch_info.py,sha256=7uoHypbbp4o71DfPmF22R_LeyM_Q9BTxBFg8O4lkd9w,7648
- sglang/srt/sampling/sampling_params.py,sha256=O8w5yTLP1dwuCdb8kMBBhMSdMWvWxSv3fz2Eq07Tm88,5192
+ sglang/srt/sampling/sampling_params.py,sha256=zzWVm8DxcUDdPwV1MIh5q76mmLwtkun0E08T6U3ZyWA,5192
  sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
  sglang/srt/sampling/penaltylib/orchestrator.py,sha256=kizcPnxtRawmDt6utRuhbk4yfNs5H5mx1DAlDVEZRv8,11328
  sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
@@ -146,10 +147,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
  sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
  sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
  sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
- sglang/test/test_utils.py,sha256=lgLPp27xQ1NfSdeJ1YUZeOer8I6G8UDce7YPyG637gY,23054
+ sglang/test/test_utils.py,sha256=XvIAMeLXr4D7uLxCUSLTKP5Upc1EJd0JX2egL897Jfo,23100
  sglang/test/srt/sampling/penaltylib/utils.py,sha256=q98pQDikkmvvvvAG-AXMYaYte1iHHW2TFhKGtAeGvdE,12802
- sglang-0.3.5.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- sglang-0.3.5.post1.dist-info/METADATA,sha256=bTPgfYz1f3ZJPNiIxNPLOoTIGKACad-XLIZ8DOlszu0,21561
- sglang-0.3.5.post1.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
- sglang-0.3.5.post1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
- sglang-0.3.5.post1.dist-info/RECORD,,
+ sglang-0.3.5.post2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sglang-0.3.5.post2.dist-info/METADATA,sha256=ajoktPOWOAmE37TcZw562A22FmxntBUWO4zLOShVKpQ,21568
+ sglang-0.3.5.post2.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ sglang-0.3.5.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.3.5.post2.dist-info/RECORD,,