sglang 0.1.14__py3-none-any.whl → 0.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. sglang/__init__.py +59 -2
  2. sglang/api.py +40 -11
  3. sglang/backend/anthropic.py +17 -3
  4. sglang/backend/litellm.py +90 -0
  5. sglang/backend/openai.py +160 -12
  6. sglang/backend/runtime_endpoint.py +62 -27
  7. sglang/backend/vertexai.py +1 -0
  8. sglang/bench_latency.py +320 -0
  9. sglang/global_config.py +24 -3
  10. sglang/lang/chat_template.py +122 -6
  11. sglang/lang/compiler.py +2 -2
  12. sglang/lang/interpreter.py +206 -98
  13. sglang/lang/ir.py +98 -34
  14. sglang/lang/tracer.py +6 -4
  15. sglang/launch_server.py +4 -1
  16. sglang/launch_server_llavavid.py +32 -0
  17. sglang/srt/constrained/__init__.py +14 -6
  18. sglang/srt/constrained/fsm_cache.py +9 -2
  19. sglang/srt/constrained/jump_forward.py +113 -24
  20. sglang/srt/conversation.py +4 -2
  21. sglang/srt/flush_cache.py +18 -0
  22. sglang/srt/hf_transformers_utils.py +144 -3
  23. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  24. sglang/srt/layers/extend_attention.py +20 -1
  25. sglang/srt/layers/fused_moe.py +596 -0
  26. sglang/srt/layers/logits_processor.py +190 -61
  27. sglang/srt/layers/radix_attention.py +62 -53
  28. sglang/srt/layers/token_attention.py +21 -9
  29. sglang/srt/managers/controller/cuda_graph_runner.py +196 -0
  30. sglang/srt/managers/controller/dp_worker.py +113 -0
  31. sglang/srt/managers/controller/infer_batch.py +908 -0
  32. sglang/srt/managers/controller/manager_multi.py +195 -0
  33. sglang/srt/managers/controller/manager_single.py +177 -0
  34. sglang/srt/managers/controller/model_runner.py +359 -0
  35. sglang/srt/managers/{router → controller}/radix_cache.py +102 -53
  36. sglang/srt/managers/controller/schedule_heuristic.py +65 -0
  37. sglang/srt/managers/controller/tp_worker.py +813 -0
  38. sglang/srt/managers/detokenizer_manager.py +42 -40
  39. sglang/srt/managers/io_struct.py +44 -10
  40. sglang/srt/managers/tokenizer_manager.py +224 -82
  41. sglang/srt/memory_pool.py +52 -59
  42. sglang/srt/model_config.py +97 -2
  43. sglang/srt/models/chatglm.py +399 -0
  44. sglang/srt/models/commandr.py +369 -0
  45. sglang/srt/models/dbrx.py +406 -0
  46. sglang/srt/models/gemma.py +34 -38
  47. sglang/srt/models/gemma2.py +436 -0
  48. sglang/srt/models/grok.py +738 -0
  49. sglang/srt/models/llama2.py +47 -37
  50. sglang/srt/models/llama_classification.py +107 -0
  51. sglang/srt/models/llava.py +92 -27
  52. sglang/srt/models/llavavid.py +298 -0
  53. sglang/srt/models/minicpm.py +366 -0
  54. sglang/srt/models/mixtral.py +302 -127
  55. sglang/srt/models/mixtral_quant.py +372 -0
  56. sglang/srt/models/qwen.py +40 -35
  57. sglang/srt/models/qwen2.py +33 -36
  58. sglang/srt/models/qwen2_moe.py +473 -0
  59. sglang/srt/models/stablelm.py +33 -39
  60. sglang/srt/models/yivl.py +19 -26
  61. sglang/srt/openai_api_adapter.py +411 -0
  62. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +44 -19
  63. sglang/srt/sampling_params.py +2 -0
  64. sglang/srt/server.py +197 -481
  65. sglang/srt/server_args.py +190 -74
  66. sglang/srt/utils.py +460 -95
  67. sglang/test/test_programs.py +73 -10
  68. sglang/test/test_utils.py +226 -7
  69. sglang/utils.py +97 -27
  70. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/METADATA +74 -45
  71. sglang-0.1.21.dist-info/RECORD +82 -0
  72. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/WHEEL +1 -1
  73. sglang/srt/backend_config.py +0 -13
  74. sglang/srt/managers/router/infer_batch.py +0 -503
  75. sglang/srt/managers/router/manager.py +0 -79
  76. sglang/srt/managers/router/model_rpc.py +0 -686
  77. sglang/srt/managers/router/model_runner.py +0 -514
  78. sglang/srt/managers/router/scheduler.py +0 -70
  79. sglang-0.1.14.dist-info/RECORD +0 -64
  80. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/LICENSE +0 -0
  81. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/top_level.txt +0 -0
@@ -116,6 +116,23 @@ register_chat_template(
116
116
  )
117
117
  )
118
118
 
119
+ # There is default system prompt for qwen
120
+ # reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
121
+ # The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
122
+ register_chat_template(
123
+ ChatTemplate(
124
+ name="qwen",
125
+ default_system_prompt="You are a helpful assistant.",
126
+ role_prefix_and_suffix={
127
+ "system": ("<|im_start|>system\n", "<|im_end|>\n"),
128
+ "user": ("<|im_start|>user\n", "<|im_end|>\n"),
129
+ "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
130
+ },
131
+ style=ChatTemplateStyle.PLAIN,
132
+ stop_str=("<|im_end|>",),
133
+ )
134
+ )
135
+
119
136
 
120
137
  register_chat_template(
121
138
  ChatTemplate(
@@ -132,6 +149,7 @@ register_chat_template(
132
149
  )
133
150
  )
134
151
 
152
+ # Reference: https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template
135
153
  register_chat_template(
136
154
  ChatTemplate(
137
155
  name="vicuna_v1.1",
@@ -148,6 +166,20 @@ register_chat_template(
148
166
  )
149
167
  )
150
168
 
169
+ # Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
170
+ register_chat_template(
171
+ ChatTemplate(
172
+ name="yi-1.5",
173
+ default_system_prompt=None,
174
+ role_prefix_and_suffix={
175
+ "system": ("", ""),
176
+ "user": ("<|im_start|>user\n", "<|im_end|>\n<|im_start|>assistant\n"),
177
+ "assistant": ("", "<|im_end|>\n"),
178
+ },
179
+ style=ChatTemplateStyle.PLAIN,
180
+ stop_str=("<|im_end|>",),
181
+ )
182
+ )
151
183
 
152
184
  register_chat_template(
153
185
  ChatTemplate(
@@ -162,10 +194,32 @@ register_chat_template(
162
194
  )
163
195
  )
164
196
 
197
+ register_chat_template(
198
+ ChatTemplate(
199
+ name="llama-3-instruct",
200
+ default_system_prompt=None,
201
+ role_prefix_and_suffix={
202
+ "system": (
203
+ "<|start_header_id|>system<|end_header_id|>\n\n",
204
+ "<|eot_id|>",
205
+ ),
206
+ "user": (
207
+ "<|start_header_id|>user<|end_header_id|>\n\n",
208
+ "<|eot_id|>",
209
+ ),
210
+ "assistant": (
211
+ "<|start_header_id|>assistant<|end_header_id|>\n\n",
212
+ "<|eot_id|>",
213
+ ),
214
+ },
215
+ stop_str=("<|eot_id|>",),
216
+ )
217
+ )
218
+
165
219
  # Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
166
220
  register_chat_template(
167
221
  ChatTemplate(
168
- name="yi",
222
+ name="yi-vl",
169
223
  default_system_prompt=(
170
224
  "This is a chat between an inquisitive human and an AI assistant. Assume the role of the AI assistant. Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers."
171
225
  "这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。"
@@ -192,6 +246,44 @@ register_chat_template(
192
246
  )
193
247
  )
194
248
 
249
+ register_chat_template(
250
+ ChatTemplate(
251
+ name="dbrx-instruct",
252
+ default_system_prompt="You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY.",
253
+ role_prefix_and_suffix={
254
+ "system": ("<|im_start|>system\n", "<|im_end|>"),
255
+ "user": ("\n<|im_start|>user\n", "<|im_end|>"),
256
+ "assistant": ("\n<|im_start|>assistant\n", "<|im_end|>"),
257
+ },
258
+ stop_str=("<|im_end|>",),
259
+ )
260
+ )
261
+
262
+ register_chat_template(
263
+ ChatTemplate(
264
+ name="c4ai-command-r",
265
+ default_system_prompt=None,
266
+ role_prefix_and_suffix={
267
+ "system": (
268
+ "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
269
+ "<|END_OF_TURN_TOKEN|>",
270
+ ),
271
+ "user": ("<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|END_OF_TURN_TOKEN|>"),
272
+ "assistant": (
273
+ "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
274
+ "<|END_OF_TURN_TOKEN|>",
275
+ ),
276
+ },
277
+ style=ChatTemplateStyle.PLAIN,
278
+ )
279
+ )
280
+
281
+
282
+ @register_chat_template_matching_function
283
+ def match_dbrx(model_path: str):
284
+ if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
285
+ return get_chat_template("dbrx-instruct")
286
+
195
287
 
196
288
  @register_chat_template_matching_function
197
289
  def match_vicuna(model_path: str):
@@ -199,6 +291,8 @@ def match_vicuna(model_path: str):
199
291
  return get_chat_template("vicuna_v1.1")
200
292
  if "llava-v1.5" in model_path.lower():
201
293
  return get_chat_template("vicuna_v1.1")
294
+ if "llava-next-video-7b" in model_path.lower():
295
+ return get_chat_template("vicuna_v1.1")
202
296
 
203
297
 
204
298
  @register_chat_template_matching_function
@@ -214,22 +308,37 @@ def match_llama2_chat(model_path: str):
214
308
  return get_chat_template("llama-2-chat")
215
309
 
216
310
 
311
+ @register_chat_template_matching_function
312
+ def match_llama3_instruct(model_path: str):
313
+ model_path = model_path.lower()
314
+ if "llama-3" in model_path and "instruct" in model_path:
315
+ return get_chat_template("llama-3-instruct")
316
+
317
+
217
318
  @register_chat_template_matching_function
218
319
  def match_chat_ml(model_path: str):
320
+ # import pdb;pdb.set_trace()
219
321
  model_path = model_path.lower()
220
322
  if "tinyllama" in model_path:
221
323
  return get_chat_template("chatml")
222
- if "qwen" in model_path and "chat" in model_path:
223
- return get_chat_template("chatml")
224
- if "llava-v1.6-34b" in model_path:
324
+ # Now the suffix for qwen2 chat model is "instruct"
325
+ if "qwen" in model_path and ("chat" in model_path or "instruct" in model_path):
326
+ return get_chat_template("qwen")
327
+ if (
328
+ "llava-v1.6-34b" in model_path
329
+ or "llava-v1.6-yi-34b" in model_path
330
+ or "llava-next-video-34b" in model_path
331
+ ):
225
332
  return get_chat_template("chatml-llava")
226
333
 
227
334
 
228
335
  @register_chat_template_matching_function
229
336
  def match_chat_yi(model_path: str):
230
337
  model_path = model_path.lower()
231
- if "yi" in model_path:
232
- return get_chat_template("yi")
338
+ if "yi-vl" in model_path and "llava" not in model_path:
339
+ return get_chat_template("yi-vl")
340
+ elif "yi-1.5" in model_path and "chat" in model_path:
341
+ return get_chat_template("yi-1.5")
233
342
 
234
343
 
235
344
  @register_chat_template_matching_function
@@ -239,6 +348,13 @@ def match_gemma_it(model_path: str):
239
348
  return get_chat_template("gemma-it")
240
349
 
241
350
 
351
+ @register_chat_template_matching_function
352
+ def match_c4ai_command_r(model_path: str):
353
+ model_path = model_path.lower()
354
+ if "c4ai-command-r" in model_path:
355
+ return get_chat_template("c4ai-command-r")
356
+
357
+
242
358
  if __name__ == "__main__":
243
359
  messages = [
244
360
  {"role": "system", "content": None}, # None means default
sglang/lang/compiler.py CHANGED
@@ -4,7 +4,7 @@ from queue import Queue
4
4
  from typing import List, Union
5
5
 
6
6
  from sglang.global_config import global_config
7
- from sglang.lang.interpreter import ProgramState, StreamExecutor, pin_program
7
+ from sglang.lang.interpreter import ProgramState, StreamExecutor, cache_program
8
8
  from sglang.lang.ir import (
9
9
  SglArgument,
10
10
  SglConstantText,
@@ -184,7 +184,7 @@ class CompiledFunction:
184
184
 
185
185
  # Extract prefix by tracing and cache it
186
186
  if len(batch_kwargs) > 1:
187
- pin_program(self.function, backend)
187
+ cache_program(self.function, backend)
188
188
 
189
189
  # Run all programs
190
190
  if num_threads == "auto":