sglang 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -2
- sglang/api.py +30 -4
- sglang/backend/litellm.py +2 -2
- sglang/backend/openai.py +26 -15
- sglang/backend/runtime_endpoint.py +18 -14
- sglang/bench_latency.py +317 -0
- sglang/global_config.py +5 -1
- sglang/lang/chat_template.py +41 -6
- sglang/lang/compiler.py +2 -2
- sglang/lang/interpreter.py +6 -2
- sglang/lang/ir.py +74 -28
- sglang/launch_server.py +4 -1
- sglang/launch_server_llavavid.py +2 -1
- sglang/srt/constrained/__init__.py +14 -6
- sglang/srt/constrained/fsm_cache.py +6 -3
- sglang/srt/constrained/jump_forward.py +113 -25
- sglang/srt/conversation.py +2 -0
- sglang/srt/flush_cache.py +2 -0
- sglang/srt/hf_transformers_utils.py +68 -9
- sglang/srt/layers/extend_attention.py +2 -1
- sglang/srt/layers/fused_moe.py +280 -169
- sglang/srt/layers/logits_processor.py +106 -42
- sglang/srt/layers/radix_attention.py +53 -29
- sglang/srt/layers/token_attention.py +4 -1
- sglang/srt/managers/controller/dp_worker.py +6 -3
- sglang/srt/managers/controller/infer_batch.py +144 -69
- sglang/srt/managers/controller/manager_multi.py +5 -5
- sglang/srt/managers/controller/manager_single.py +9 -4
- sglang/srt/managers/controller/model_runner.py +167 -55
- sglang/srt/managers/controller/radix_cache.py +4 -0
- sglang/srt/managers/controller/schedule_heuristic.py +2 -0
- sglang/srt/managers/controller/tp_worker.py +156 -134
- sglang/srt/managers/detokenizer_manager.py +19 -21
- sglang/srt/managers/io_struct.py +11 -5
- sglang/srt/managers/tokenizer_manager.py +16 -14
- sglang/srt/model_config.py +89 -4
- sglang/srt/models/chatglm.py +399 -0
- sglang/srt/models/commandr.py +2 -2
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/gemma.py +5 -1
- sglang/srt/models/gemma2.py +436 -0
- sglang/srt/models/grok.py +204 -137
- sglang/srt/models/llama2.py +12 -5
- sglang/srt/models/llama_classification.py +107 -0
- sglang/srt/models/llava.py +11 -8
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/minicpm.py +373 -0
- sglang/srt/models/mixtral.py +164 -115
- sglang/srt/models/mixtral_quant.py +0 -1
- sglang/srt/models/qwen.py +1 -1
- sglang/srt/models/qwen2.py +1 -1
- sglang/srt/models/qwen2_moe.py +454 -0
- sglang/srt/models/stablelm.py +1 -1
- sglang/srt/models/yivl.py +2 -2
- sglang/srt/openai_api_adapter.py +35 -25
- sglang/srt/openai_protocol.py +2 -2
- sglang/srt/server.py +69 -19
- sglang/srt/server_args.py +76 -43
- sglang/srt/utils.py +177 -35
- sglang/test/test_programs.py +28 -10
- sglang/utils.py +4 -3
- {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/METADATA +44 -31
- sglang-0.1.19.dist-info/RECORD +81 -0
- {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/WHEEL +1 -1
- sglang/srt/managers/router/infer_batch.py +0 -596
- sglang/srt/managers/router/manager.py +0 -82
- sglang/srt/managers/router/model_rpc.py +0 -818
- sglang/srt/managers/router/model_runner.py +0 -445
- sglang/srt/managers/router/radix_cache.py +0 -267
- sglang/srt/managers/router/scheduler.py +0 -59
- sglang-0.1.17.dist-info/RECORD +0 -81
- {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/LICENSE +0 -0
- {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/top_level.txt +0 -0
sglang/srt/hf_transformers_utils.py
CHANGED
```diff
@@ -1,10 +1,10 @@
 """Utilities for Huggingface Transformers."""
 
+import functools
 import json
 import os
 import warnings
-import functools
-from typing import Optional, Union, AbstractSet, Collection, Literal
+from typing import AbstractSet, Collection, Literal, Optional, Union
 
 from huggingface_hub import snapshot_download
 from transformers import (
@@ -88,6 +88,9 @@ def get_tokenizer(
     if tokenizer_name.endswith(".json"):
         return TiktokenTokenizer(tokenizer_name)
 
+    if tokenizer_name.endswith(".model"):
+        return SentencePieceTokenizer(tokenizer_name)
+
     """Gets a tokenizer for the given model name via Huggingface."""
     if is_multimodal_model(tokenizer_name):
         processor = get_processor(
@@ -179,6 +182,8 @@ def get_processor(
 class TiktokenTokenizer:
     def __init__(self, tokenizer_path):
         import tiktoken
+        from jinja2 import Template
+
         PAT_STR_B = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
 
         # Read JSON
@@ -190,7 +195,8 @@ class TiktokenTokenizer:
             bytes(item["bytes"]): item["token"] for item in tok_dict["regular_tokens"]
         }
         special_tokens = {
-            bytes(item["bytes"]).decode(): item["token"] for item in tok_dict["special_tokens"]
+            bytes(item["bytes"]).decode(): item["token"]
+            for item in tok_dict["special_tokens"]
         }
         assert tok_dict["word_split"] == "V1"
 
@@ -202,7 +208,10 @@ class TiktokenTokenizer:
         }
         if "default_allowed_special" in tok_dict:
            default_allowed_special = set(
-                [bytes(bytes_list).decode() for bytes_list in tok_dict["default_allowed_special"]]
+                [
+                    bytes(bytes_list).decode()
+                    for bytes_list in tok_dict["default_allowed_special"]
+                ]
             )
         else:
             default_allowed_special = None
@@ -211,25 +220,35 @@ class TiktokenTokenizer:
 
         tokenizer = tiktoken.Encoding(**kwargs)
         tokenizer._default_allowed_special = default_allowed_special or set()
+        tokenizer._default_allowed_special |= {"<|separator|>"}
 
         def encode_patched(
             self,
             text: str,
             *,
-            allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),  # noqa: B006
+            allowed_special: Union[
+                Literal["all"], AbstractSet[str]
+            ] = set(),  # noqa: B006
             disallowed_special: Union[Literal["all"], Collection[str]] = "all",
         ) -> list[int]:
             if isinstance(allowed_special, set):
                 allowed_special |= self._default_allowed_special
             return tiktoken.Encoding.encode(
-                self, text, allowed_special=allowed_special, disallowed_special=disallowed_special
+                self,
+                text,
+                allowed_special=allowed_special,
+                disallowed_special=disallowed_special,
             )
+
         tokenizer.encode = functools.partial(encode_patched, tokenizer)
 
         # Convert to HF interface
         self.tokenizer = tokenizer
         self.eos_token_id = tokenizer._special_tokens["<|eos|>"]
         self.vocab_size = tokenizer.n_vocab
+        self.chat_template = Template(
+            "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
+        )
 
     def encode(self, x, add_special_tokens=False):
         return self.tokenizer.encode(x)
```
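The `encode_patched` hunk above overrides `encode` on a single `tiktoken.Encoding` instance by pre-binding the instance with `functools.partial`, so the new defaults apply without subclassing. A minimal standalone sketch of that pattern (hypothetical names, not sglang code):

```python
import functools

class Encoding:
    def encode(self, text):
        return list(text.encode())

def encode_patched(self, text):
    # `self` is pre-bound by functools.partial below, so callers keep the
    # plain `enc.encode(text)` call shape while the patched body runs.
    return Encoding.encode(self, text)

enc = Encoding()
# The instance attribute shadows the class method for this object only.
enc.encode = functools.partial(encode_patched, enc)
assert enc.encode("hi") == [104, 105]
```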
```diff
@@ -237,10 +256,50 @@ class TiktokenTokenizer:
     def decode(self, x):
         return self.tokenizer.decode(x)
 
-    def batch_decode(self, batch, skip_special_tokens=True, spaces_between_special_tokens=False):
+    def batch_decode(
+        self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
+    ):
         if isinstance(batch[0], int):
             batch = [[x] for x in batch]
         return self.tokenizer.decode_batch(batch)
 
-    def
-
+    def apply_chat_template(self, messages, tokenize, add_generation_prompt):
+        ret = self.chat_template.render(
+            messages=messages, add_generation_prompt=add_generation_prompt
+        )
+        return self.encode(ret) if tokenize else ret
+
+
+class SentencePieceTokenizer:
+    def __init__(self, tokenizer_path):
+        import sentencepiece as spm
+        from jinja2 import Template
+
+        tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path)
+
+        # Convert to HF interface
+        self.tokenizer = tokenizer
+        self.eos_token_id = tokenizer.eos_id()
+        self.vocab_size = tokenizer.vocab_size()
+        self.chat_template = Template(
+            "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
+        )
+
+    def encode(self, x, add_special_tokens=False):
+        return self.tokenizer.encode(x)
+
+    def decode(self, x):
+        return self.tokenizer.decode(x)
+
+    def batch_decode(
+        self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
+    ):
+        if isinstance(batch[0], int):
+            batch = [[x] for x in batch]
+        return self.tokenizer.decode(batch)
+
+    def apply_chat_template(self, messages, tokenize, add_generation_prompt):
+        ret = self.chat_template.render(
+            messages=messages, add_generation_prompt=add_generation_prompt
+        )
+        return self.encode(ret) if tokenize else ret
```
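Taken together, these hunks make `get_tokenizer` dispatch on the tokenizer file's extension (`.json` → `TiktokenTokenizer`, `.model` → `SentencePieceTokenizer`), with both wrappers exposing the same HF-style surface, including `apply_chat_template`. A hedged usage sketch; the path is illustrative and assumes a valid local SentencePiece model file:

```python
from sglang.srt.hf_transformers_utils import get_tokenizer

# Hypothetical local path; any SentencePiece *.model file would take this branch.
tokenizer = get_tokenizer("/models/my-model/tokenizer.model")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
# Renders the built-in Jinja template ("System: ...", "Human: ...", "Assistant:"),
# then encodes only when tokenize=True.
prompt_text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
prompt_ids = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True
)
```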
sglang/srt/layers/extend_attention.py
CHANGED
```diff
@@ -191,6 +191,7 @@ def extend_attention_fwd(
     b_seq_len_extend,
     max_len_in_batch,
     max_len_extend,
+    sm_scale=None,
     logit_cap=-1,
 ):
     """
@@ -213,7 +214,7 @@ def extend_attention_fwd(
     else:
         BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
 
-    sm_scale = 1.0 / (Lq**0.5)
+    sm_scale = 1.0 / (Lq**0.5) if sm_scale is None else sm_scale
     batch_size, head_num = b_seq_len.shape[0], q_extend.shape[1]
     kv_group_num = q_extend.shape[1] // k_extend.shape[1]
 
```
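The second hunk keeps the new `sm_scale` argument backward compatible: when omitted, it falls back to the usual 1/sqrt(head_dim) softmax scaling, while callers that need a custom logit scale can now pass one through. A minimal sketch of the pattern outside the Triton kernel (hypothetical function, not sglang code):

```python
import math

def scaled_dot(q, k, sm_scale=None):
    # None means "use the default 1/sqrt(d)"; an explicit value overrides it,
    # mirroring the extend_attention_fwd change above.
    d = len(q)
    sm_scale = 1.0 / math.sqrt(d) if sm_scale is None else sm_scale
    return sm_scale * sum(qi * ki for qi, ki in zip(q, k))

print(scaled_dot([1.0, 2.0], [3.0, 4.0]))                 # default: 11 / sqrt(2)
print(scaled_dot([1.0, 2.0], [3.0, 4.0], sm_scale=0.5))   # custom scale: 5.5
```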