sglang 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. sglang/__init__.py +2 -2
  2. sglang/api.py +30 -4
  3. sglang/backend/litellm.py +2 -2
  4. sglang/backend/openai.py +26 -15
  5. sglang/backend/runtime_endpoint.py +18 -14
  6. sglang/bench_latency.py +317 -0
  7. sglang/global_config.py +5 -1
  8. sglang/lang/chat_template.py +41 -6
  9. sglang/lang/compiler.py +2 -2
  10. sglang/lang/interpreter.py +6 -2
  11. sglang/lang/ir.py +74 -28
  12. sglang/launch_server.py +4 -1
  13. sglang/launch_server_llavavid.py +2 -1
  14. sglang/srt/constrained/__init__.py +14 -6
  15. sglang/srt/constrained/fsm_cache.py +6 -3
  16. sglang/srt/constrained/jump_forward.py +113 -25
  17. sglang/srt/conversation.py +2 -0
  18. sglang/srt/flush_cache.py +2 -0
  19. sglang/srt/hf_transformers_utils.py +68 -9
  20. sglang/srt/layers/extend_attention.py +2 -1
  21. sglang/srt/layers/fused_moe.py +280 -169
  22. sglang/srt/layers/logits_processor.py +106 -42
  23. sglang/srt/layers/radix_attention.py +53 -29
  24. sglang/srt/layers/token_attention.py +4 -1
  25. sglang/srt/managers/controller/dp_worker.py +6 -3
  26. sglang/srt/managers/controller/infer_batch.py +144 -69
  27. sglang/srt/managers/controller/manager_multi.py +5 -5
  28. sglang/srt/managers/controller/manager_single.py +9 -4
  29. sglang/srt/managers/controller/model_runner.py +167 -55
  30. sglang/srt/managers/controller/radix_cache.py +4 -0
  31. sglang/srt/managers/controller/schedule_heuristic.py +2 -0
  32. sglang/srt/managers/controller/tp_worker.py +156 -134
  33. sglang/srt/managers/detokenizer_manager.py +19 -21
  34. sglang/srt/managers/io_struct.py +11 -5
  35. sglang/srt/managers/tokenizer_manager.py +16 -14
  36. sglang/srt/model_config.py +89 -4
  37. sglang/srt/models/chatglm.py +399 -0
  38. sglang/srt/models/commandr.py +2 -2
  39. sglang/srt/models/dbrx.py +1 -1
  40. sglang/srt/models/gemma.py +5 -1
  41. sglang/srt/models/gemma2.py +436 -0
  42. sglang/srt/models/grok.py +204 -137
  43. sglang/srt/models/llama2.py +12 -5
  44. sglang/srt/models/llama_classification.py +107 -0
  45. sglang/srt/models/llava.py +11 -8
  46. sglang/srt/models/llavavid.py +1 -1
  47. sglang/srt/models/minicpm.py +373 -0
  48. sglang/srt/models/mixtral.py +164 -115
  49. sglang/srt/models/mixtral_quant.py +0 -1
  50. sglang/srt/models/qwen.py +1 -1
  51. sglang/srt/models/qwen2.py +1 -1
  52. sglang/srt/models/qwen2_moe.py +454 -0
  53. sglang/srt/models/stablelm.py +1 -1
  54. sglang/srt/models/yivl.py +2 -2
  55. sglang/srt/openai_api_adapter.py +35 -25
  56. sglang/srt/openai_protocol.py +2 -2
  57. sglang/srt/server.py +69 -19
  58. sglang/srt/server_args.py +76 -43
  59. sglang/srt/utils.py +177 -35
  60. sglang/test/test_programs.py +28 -10
  61. sglang/utils.py +4 -3
  62. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/METADATA +44 -31
  63. sglang-0.1.19.dist-info/RECORD +81 -0
  64. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/WHEEL +1 -1
  65. sglang/srt/managers/router/infer_batch.py +0 -596
  66. sglang/srt/managers/router/manager.py +0 -82
  67. sglang/srt/managers/router/model_rpc.py +0 -818
  68. sglang/srt/managers/router/model_runner.py +0 -445
  69. sglang/srt/managers/router/radix_cache.py +0 -267
  70. sglang/srt/managers/router/scheduler.py +0 -59
  71. sglang-0.1.17.dist-info/RECORD +0 -81
  72. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/LICENSE +0 -0
  73. {sglang-0.1.17.dist-info → sglang-0.1.19.dist-info}/top_level.txt +0 -0
sglang/srt/flush_cache.py CHANGED
@@ -1,4 +1,6 @@
 """
+Flush the KV cache.
+
 Usage:
 python3 -m sglang.srt.flush_cache --url http://localhost:30000
 """
sglang/srt/hf_transformers_utils.py CHANGED
@@ -1,10 +1,10 @@
 """Utilities for Huggingface Transformers."""
 
+import functools
 import json
 import os
 import warnings
-import functools
-from typing import Optional, Union, AbstractSet, Collection, Literal
+from typing import AbstractSet, Collection, Literal, Optional, Union
 
 from huggingface_hub import snapshot_download
 from transformers import (
@@ -88,6 +88,9 @@ def get_tokenizer(
     if tokenizer_name.endswith(".json"):
         return TiktokenTokenizer(tokenizer_name)
 
+    if tokenizer_name.endswith(".model"):
+        return SentencePieceTokenizer(tokenizer_name)
+
     """Gets a tokenizer for the given model name via Huggingface."""
     if is_multimodal_model(tokenizer_name):
         processor = get_processor(
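
This hunk extends the suffix-based dispatch in `get_tokenizer`: `.json` paths already routed to `TiktokenTokenizer`, and `.model` paths now route to the new `SentencePieceTokenizer`; other names fall through to the Huggingface loader. A hedged usage sketch (the file paths are hypothetical placeholders):

```python
from sglang.srt.hf_transformers_utils import get_tokenizer

# Suffix-based dispatch; the paths below are hypothetical examples.
tok_a = get_tokenizer("/models/tokenizer.json")    # -> TiktokenTokenizer
tok_b = get_tokenizer("/models/tokenizer.model")   # -> SentencePieceTokenizer (new in 0.1.19)
tok_c = get_tokenizer("meta-llama/Llama-2-7b-hf")  # -> regular Huggingface tokenizer
```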
@@ -179,6 +182,8 @@ def get_processor(
 class TiktokenTokenizer:
     def __init__(self, tokenizer_path):
         import tiktoken
+        from jinja2 import Template
+
         PAT_STR_B = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
 
         # Read JSON
@@ -190,7 +195,8 @@ class TiktokenTokenizer:
             bytes(item["bytes"]): item["token"] for item in tok_dict["regular_tokens"]
         }
         special_tokens = {
-            bytes(item["bytes"]).decode(): item["token"] for item in tok_dict["special_tokens"]
+            bytes(item["bytes"]).decode(): item["token"]
+            for item in tok_dict["special_tokens"]
         }
         assert tok_dict["word_split"] == "V1"
 
@@ -202,7 +208,10 @@ class TiktokenTokenizer:
         }
         if "default_allowed_special" in tok_dict:
             default_allowed_special = set(
-                [bytes(bytes_list).decode() for bytes_list in tok_dict["default_allowed_special"]]
+                [
+                    bytes(bytes_list).decode()
+                    for bytes_list in tok_dict["default_allowed_special"]
+                ]
             )
         else:
             default_allowed_special = None
@@ -211,25 +220,35 @@ class TiktokenTokenizer:
 
         tokenizer = tiktoken.Encoding(**kwargs)
         tokenizer._default_allowed_special = default_allowed_special or set()
+        tokenizer._default_allowed_special |= {"<|separator|>"}
 
         def encode_patched(
             self,
             text: str,
             *,
-            allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),  # noqa: B006
+            allowed_special: Union[
+                Literal["all"], AbstractSet[str]
+            ] = set(),  # noqa: B006
             disallowed_special: Union[Literal["all"], Collection[str]] = "all",
         ) -> list[int]:
             if isinstance(allowed_special, set):
                 allowed_special |= self._default_allowed_special
             return tiktoken.Encoding.encode(
-                self, text, allowed_special=allowed_special, disallowed_special=disallowed_special
+                self,
+                text,
+                allowed_special=allowed_special,
+                disallowed_special=disallowed_special,
             )
+
         tokenizer.encode = functools.partial(encode_patched, tokenizer)
 
         # Convert to HF interface
         self.tokenizer = tokenizer
         self.eos_token_id = tokenizer._special_tokens["<|eos|>"]
         self.vocab_size = tokenizer.n_vocab
+        self.chat_template = Template(
+            "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
+        )
 
     def encode(self, x, add_special_tokens=False):
         return self.tokenizer.encode(x)
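
The hunk above also shows the monkey-patching idiom this file relies on: `tiktoken.Encoding.encode` is wrapped so the tokenizer's default special tokens (now including `<|separator|>`) are always allowed, and the wrapper is rebound on the single instance with `functools.partial`. A self-contained sketch of the same pattern, using a toy class rather than tiktoken:

```python
import functools


class Encoder:
    """Toy stand-in for tiktoken.Encoding; for illustration only."""

    def encode(self, text, *, allowed_special=frozenset()):
        return f"encode({text!r}, allowed={sorted(allowed_special)})"


enc = Encoder()
enc._default_allowed_special = {"<|eos|>", "<|separator|>"}


def encode_patched(self, text, *, allowed_special=None):
    # Merge the instance's defaults into the caller-supplied set,
    # mirroring how the diff wraps tiktoken.Encoding.encode.
    allowed = set(allowed_special or ()) | self._default_allowed_special
    return Encoder.encode(self, text, allowed_special=allowed)


# Rebind the method on this one instance, the same trick as
# `tokenizer.encode = functools.partial(encode_patched, tokenizer)`.
enc.encode = functools.partial(encode_patched, enc)

print(enc.encode("hi"))
# encode('hi', allowed=['<|eos|>', '<|separator|>'])
```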
@@ -237,10 +256,50 @@ class TiktokenTokenizer:
     def decode(self, x):
         return self.tokenizer.decode(x)
 
-    def batch_decode(self, batch, skip_special_tokens=True, spaces_between_special_tokens=False):
+    def batch_decode(
+        self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
+    ):
         if isinstance(batch[0], int):
             batch = [[x] for x in batch]
         return self.tokenizer.decode_batch(batch)
 
-    def convert_ids_to_tokens(self, index):
-        return self.tokenizer.decode_single_token_bytes(index).decode("utf-8", errors="ignore")
+    def apply_chat_template(self, messages, tokenize, add_generation_prompt):
+        ret = self.chat_template.render(
+            messages=messages, add_generation_prompt=add_generation_prompt
+        )
+        return self.encode(ret) if tokenize else ret
+
+
+class SentencePieceTokenizer:
+    def __init__(self, tokenizer_path):
+        import sentencepiece as spm
+        from jinja2 import Template
+
+        tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path)
+
+        # Convert to HF interface
+        self.tokenizer = tokenizer
+        self.eos_token_id = tokenizer.eos_id()
+        self.vocab_size = tokenizer.vocab_size()
+        self.chat_template = Template(
+            "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
+        )
+
+    def encode(self, x, add_special_tokens=False):
+        return self.tokenizer.encode(x)
+
+    def decode(self, x):
+        return self.tokenizer.decode(x)
+
+    def batch_decode(
+        self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
+    ):
+        if isinstance(batch[0], int):
+            batch = [[x] for x in batch]
+        return self.tokenizer.decode(batch)
+
+    def apply_chat_template(self, messages, tokenize, add_generation_prompt):
+        ret = self.chat_template.render(
+            messages=messages, add_generation_prompt=add_generation_prompt
+        )
+        return self.encode(ret) if tokenize else ret
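
Both wrappers expose the same minimal HF-style surface (`encode`, `decode`, `batch_decode`, `apply_chat_template`) backed by an identical Jinja2 chat template. A sketch of what that template renders, reusing the exact template string from the diff:

```python
from jinja2 import Template

# The same chat template string both tokenizer wrappers construct.
chat_template = Template(
    "{% for message in messages %}{% if message['role'] == 'user' %}"
    "{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}"
    "{% elif message['role'] == 'system' %}"
    "{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}"
    "{% elif message['role'] == 'assistant' %}"
    "{{ 'Assistant: ' + message['content'] + '<|separator|>\n\n' }}"
    "{% endif %}{% endfor %}"
    "{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
)

messages = [
    {"role": "system", "content": "You are helpful."},
    {"role": "user", "content": "Hello!"},
]
print(chat_template.render(messages=messages, add_generation_prompt=True))
# System: You are helpful.<|separator|>
#
# Human: Hello!<|separator|>
#
# Assistant:
```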
sglang/srt/layers/extend_attention.py CHANGED
@@ -191,6 +191,7 @@ def extend_attention_fwd(
     b_seq_len_extend,
     max_len_in_batch,
     max_len_extend,
+    sm_scale=None,
     logit_cap=-1,
 ):
     """
@@ -213,7 +214,7 @@ def extend_attention_fwd(
     else:
         BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
 
-    sm_scale = 1.0 / (Lq**0.5)
+    sm_scale = 1.0 / (Lq**0.5) if sm_scale is None else sm_scale
     batch_size, head_num = b_seq_len.shape[0], q_extend.shape[1]
     kv_group_num = q_extend.shape[1] // k_extend.shape[1]
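
This change makes the softmax scale overridable: callers can pass an explicit `sm_scale`, while `None` preserves the old 1/sqrt(head_dim) default. The fallback logic in isolation:

```python
import math


def resolve_sm_scale(Lq: int, sm_scale=None) -> float:
    # Same fallback as the diff: default to 1/sqrt(head_dim)
    # unless the caller supplies an explicit scale.
    return 1.0 / (Lq**0.5) if sm_scale is None else sm_scale


assert math.isclose(resolve_sm_scale(128), 1 / math.sqrt(128))  # ~0.0884
assert resolve_sm_scale(128, sm_scale=0.1) == 0.1  # explicit override wins
```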