sglang 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. sglang/__init__.py +55 -2
  2. sglang/api.py +3 -5
  3. sglang/backend/anthropic.py +18 -4
  4. sglang/backend/openai.py +2 -1
  5. sglang/backend/runtime_endpoint.py +18 -5
  6. sglang/backend/vertexai.py +1 -0
  7. sglang/global_config.py +1 -0
  8. sglang/lang/chat_template.py +74 -0
  9. sglang/lang/interpreter.py +40 -16
  10. sglang/lang/tracer.py +6 -4
  11. sglang/launch_server.py +2 -1
  12. sglang/srt/constrained/fsm_cache.py +1 -0
  13. sglang/srt/constrained/jump_forward.py +1 -0
  14. sglang/srt/conversation.py +2 -2
  15. sglang/srt/hf_transformers_utils.py +2 -1
  16. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  17. sglang/srt/layers/extend_attention.py +1 -0
  18. sglang/srt/layers/logits_processor.py +114 -54
  19. sglang/srt/layers/radix_attention.py +2 -1
  20. sglang/srt/layers/token_attention.py +1 -0
  21. sglang/srt/managers/detokenizer_manager.py +5 -1
  22. sglang/srt/managers/io_struct.py +12 -0
  23. sglang/srt/managers/router/infer_batch.py +70 -33
  24. sglang/srt/managers/router/manager.py +7 -2
  25. sglang/srt/managers/router/model_rpc.py +116 -73
  26. sglang/srt/managers/router/model_runner.py +111 -167
  27. sglang/srt/managers/router/radix_cache.py +46 -38
  28. sglang/srt/managers/tokenizer_manager.py +56 -11
  29. sglang/srt/memory_pool.py +5 -14
  30. sglang/srt/model_config.py +7 -0
  31. sglang/srt/models/commandr.py +376 -0
  32. sglang/srt/models/dbrx.py +413 -0
  33. sglang/srt/models/dbrx_config.py +281 -0
  34. sglang/srt/models/gemma.py +22 -20
  35. sglang/srt/models/llama2.py +23 -21
  36. sglang/srt/models/llava.py +12 -10
  37. sglang/srt/models/mixtral.py +27 -25
  38. sglang/srt/models/qwen.py +23 -21
  39. sglang/srt/models/qwen2.py +23 -21
  40. sglang/srt/models/stablelm.py +20 -21
  41. sglang/srt/models/yivl.py +6 -5
  42. sglang/srt/openai_api_adapter.py +356 -0
  43. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +36 -20
  44. sglang/srt/sampling_params.py +2 -0
  45. sglang/srt/server.py +68 -447
  46. sglang/srt/server_args.py +76 -49
  47. sglang/srt/utils.py +88 -32
  48. sglang/srt/weight_utils.py +402 -0
  49. sglang/test/test_programs.py +8 -7
  50. sglang/test/test_utils.py +195 -7
  51. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/METADATA +12 -14
  52. sglang-0.1.15.dist-info/RECORD +69 -0
  53. sglang-0.1.14.dist-info/RECORD +0 -64
  54. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/LICENSE +0 -0
  55. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/WHEEL +0 -0
  56. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/top_level.txt +0 -0
sglang/__init__.py CHANGED
@@ -1,4 +1,57 @@
-__version__ = "0.1.14"
+__version__ = "0.1.15"
 
-from sglang.api import *
+# SGL API Components
+from sglang.api import (
+    Runtime,
+    assistant,
+    assistant_begin,
+    assistant_end,
+    flush_cache,
+    function,
+    gen,
+    gen_int,
+    gen_string,
+    get_server_args,
+    image,
+    select,
+    set_default_backend,
+    system,
+    user,
+    user_begin,
+    user_end,
+)
+
+# SGL Backends
+from sglang.backend.anthropic import Anthropic
+from sglang.backend.openai import OpenAI
+from sglang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.backend.vertexai import VertexAI
+
+# Global Configurations
 from sglang.global_config import global_config
+
+# public APIs management
+__all__ = [
+    "global_config",
+    "Anthropic",
+    "OpenAI",
+    "RuntimeEndpoint",
+    "VertexAI",
+    "function",
+    "Runtime",
+    "set_default_backend",
+    "flush_cache",
+    "get_server_args",
+    "gen",
+    "gen_int",
+    "gen_string",
+    "image",
+    "select",
+    "system",
+    "user",
+    "assistant",
+    "user_begin",
+    "user_end",
+    "assistant_begin",
+    "assistant_end",
+]
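With `__all__` and the explicit re-exports in place, the whole public surface is importable from the top-level package. A minimal usage sketch, not part of the diff; the model name and prompt are placeholders and it assumes an OpenAI API key is configured:

import sglang as sgl

@sgl.function
def qa(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))

# Any of the re-exported backends above can serve as the default backend.
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
state = qa.run(question="What does sglang 0.1.15 add?")
print(state["answer"])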
sglang/api.py CHANGED
@@ -1,13 +1,10 @@
-"""Public API"""
+"""Some Public API Definitions"""
 
+import os
 import re
 from typing import Callable, List, Optional, Union
 
-from sglang.backend.anthropic import Anthropic
 from sglang.backend.base_backend import BaseBackend
-from sglang.backend.openai import OpenAI
-from sglang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.backend.vertexai import VertexAI
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglExpr,
@@ -35,6 +32,7 @@ def function(
 
 def Runtime(*args, **kwargs):
     # Avoid importing unnecessary dependency
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
     from sglang.srt.server import Runtime
 
     return Runtime(*args, **kwargs)
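For context, the lazy `Runtime` wrapper is typically used like the sketch below; the model path is a placeholder and the call pattern is the usual one for this version, not something introduced by this diff:

import sglang as sgl

# Hedged sketch: start a local runtime through the wrapper above (which now also
# silences TensorFlow logging via TF_CPP_MIN_LOG_LEVEL before importing sglang.srt),
# use it as the default backend, then shut it down.
runtime = sgl.Runtime(model_path="meta-llama/Llama-2-7b-chat-hf")
sgl.set_default_backend(runtime)
# ... run sgl.function programs here ...
runtime.shutdown()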
sglang/backend/anthropic.py CHANGED
@@ -1,6 +1,7 @@
 from typing import List, Optional, Union
 
 import numpy as np
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.interpreter import StreamExecutor
@@ -13,7 +14,7 @@ except ImportError as e:
 
 
 class Anthropic(BaseBackend):
-    def __init__(self, model_name):
+    def __init__(self, model_name, *args, **kwargs):
         super().__init__()
 
         if isinstance(anthropic, Exception):
@@ -21,6 +22,7 @@ class Anthropic(BaseBackend):
 
         self.model_name = model_name
         self.chat_template = get_chat_template("claude")
+        self.client = anthropic.Anthropic(*args, **kwargs)
 
     def get_chat_template(self):
         return self.chat_template
@@ -35,8 +37,14 @@ class Anthropic(BaseBackend):
         else:
             messages = [{"role": "user", "content": s.text_}]
 
-        ret = anthropic.Anthropic().messages.create(
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        ret = self.client.messages.create(
             model=self.model_name,
+            system=system,
             messages=messages,
             **sampling_params.to_anthropic_kwargs(),
         )
@@ -54,10 +62,16 @@ class Anthropic(BaseBackend):
         else:
             messages = [{"role": "user", "content": s.text_}]
 
-        with anthropic.Anthropic().messages.stream(
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        with self.client.messages.stream(
             model=self.model_name,
+            system=system,
             messages=messages,
             **sampling_params.to_anthropic_kwargs(),
         ) as stream:
             for text in stream.text_stream:
-                yield text, {}
+                yield text, {}
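The practical effect of these hunks: the backend now reuses a single `anthropic.Anthropic(*args, **kwargs)` client, and a leading system-role message is pulled out of `messages` and passed through the Messages API's top-level `system` parameter instead of being sent as a chat turn. A standalone sketch of just the hoisting step, with illustrative data:

# Illustrative messages, not taken from the diff.
messages = [
    {"role": "system", "content": "Answer in one sentence."},
    {"role": "user", "content": "What is SGLang?"},
]
if messages and messages[0]["role"] == "system":
    system = messages.pop(0)["content"]
else:
    system = ""
# system == "Answer in one sentence."; messages now holds only the user turn,
# which is what self.client.messages.create(model=..., system=system, messages=messages)
# receives in the code above.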
sglang/backend/openai.py CHANGED
@@ -3,6 +3,7 @@ import time
 from typing import Callable, List, Optional, Union
 
 import numpy as np
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path
 from sglang.lang.interpreter import StreamExecutor
@@ -227,7 +228,7 @@ class OpenAI(BaseBackend):
             prompt_tokens.append(ret_token)
 
         decision = choices[np.argmax(scores)]
-        return decision, scores, scores
+        return decision, scores, None, None
 
 
 def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
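This aligns the OpenAI backend with the four-value `select()` contract used throughout this release: `(decision, normalized_prompt_logprobs, prefill_token_logprobs, decode_token_logprobs)`. A minimal sketch of the shape, standalone and not the backend's actual method:

import numpy as np

def select_like_openai(choices, scores):
    # scores: one normalized score per choice; the OpenAI backend has no per-token
    # logprobs to report here, so the last two slots of the tuple are None.
    decision = choices[int(np.argmax(scores))]
    return decision, scores, None, None

decision, norm_logprobs, prefill_lp, decode_lp = select_like_openai(
    ["yes", "no"], [-0.3, -1.2]
)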
sglang/backend/runtime_endpoint.py CHANGED
@@ -3,6 +3,7 @@ from typing import Callable, List, Optional, Union
 
 import numpy as np
 import requests
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.global_config import global_config
 from sglang.lang.chat_template import get_chat_template_by_model_path
@@ -73,9 +74,11 @@ class RuntimeEndpoint(BaseBackend):
         assert res.status_code == 200
 
     def commit_lazy_operations(self, s: StreamExecutor):
+        data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
+        self._add_images(s, data)
         res = http_request(
             self.base_url + "/generate",
-            json={"text": s.text_, "sampling_params": {"max_new_tokens": 0}},
+            json=data,
             auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
@@ -104,6 +107,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 **sampling_params.to_srt_kwargs(),
             },
         }
@@ -112,6 +116,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 "dtype": "int",
                 **sampling_params.to_srt_kwargs(),
             },
@@ -142,6 +147,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 **sampling_params.to_srt_kwargs(),
             },
         }
@@ -150,6 +156,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 "dtype": "int",
                 **sampling_params.to_srt_kwargs(),
             },
@@ -224,13 +231,19 @@ class RuntimeEndpoint(BaseBackend):
         )
         assert res.status_code == 200
         obj = res.json()
-        normalized_prompt_logprob = [
+        normalized_prompt_logprobs = [
             r["meta_info"]["normalized_prompt_logprob"] for r in obj
         ]
-        prompt_logprob = [r["meta_info"]["prompt_logprob"] for r in obj]
+        decision = choices[np.argmax(normalized_prompt_logprobs)]
+        prefill_token_logprobs = [r["meta_info"]["prefill_token_logprobs"] for r in obj]
+        decode_token_logprobs = [r["meta_info"]["decode_token_logprobs"] for r in obj]
 
-        decision = choices[np.argmax(normalized_prompt_logprob)]
-        return decision, normalized_prompt_logprob, prompt_logprob
+        return (
+            decision,
+            normalized_prompt_logprobs,
+            prefill_token_logprobs,
+            decode_token_logprobs,
+        )
 
     def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
         res = http_request(
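Each of those payload hunks forwards the new `global_config.spaces_between_special_tokens_in_out` flag (added in global_config.py below) alongside `skip_special_tokens`. Roughly, the `sampling_params` sent to `/generate` now look like this sketch; the values shown are the defaults and `max_new_tokens` is illustrative:

from sglang.global_config import global_config

sampling_params = {
    # True by default
    "skip_special_tokens": global_config.skip_special_tokens_in_output,
    # New in 0.1.15, also True by default
    "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
    "max_new_tokens": 16,
}
# Setting global_config.spaces_between_special_tokens_in_out = False before a call
# turns the extra spacing off for subsequent RuntimeEndpoint requests.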
sglang/backend/vertexai.py CHANGED
@@ -3,6 +3,7 @@ import warnings
 from typing import List, Optional, Union
 
 import numpy as np
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.interpreter import StreamExecutor
sglang/global_config.py CHANGED
@@ -12,6 +12,7 @@ class GlobalConfig:
 
         # Output configs
         self.skip_special_tokens_in_output = True
+        self.spaces_between_special_tokens_in_out = True
 
         # Optimization configs
         self.eager_fill_image = False
sglang/lang/chat_template.py CHANGED
@@ -162,6 +162,28 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="llama-3-instruct",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|start_header_id|>system<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "user": (
+                "<|start_header_id|>user<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "assistant": (
+                "<|start_header_id|>assistant<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+        },
+        stop_str=("<|eot_id|>",),
+    )
+)
+
 # Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
 register_chat_template(
     ChatTemplate(
@@ -192,6 +214,44 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="dbrx-instruct",
+        default_system_prompt="You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>"),
+            "user": ("\n<|im_start|>user\n", "<|im_end|>"),
+            "assistant": ("\n<|im_start|>assistant\n", "<|im_end|>"),
+        },
+        stop_str=("<|im_end|>",),
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="c4ai-command-r",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
+                "<|END_OF_TURN_TOKEN|>",
+            ),
+            "user": ("<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|END_OF_TURN_TOKEN|>"),
+            "assistant": (
+                "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+                "<|END_OF_TURN_TOKEN|>",
+            ),
+        },
+        style=ChatTemplateStyle.PLAIN,
+    )
+)
+
+
+@register_chat_template_matching_function
+def match_dbrx(model_path: str):
+    if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
+        return get_chat_template("dbrx-instruct")
+
 
 @register_chat_template_matching_function
 def match_vicuna(model_path: str):
@@ -214,6 +274,13 @@ def match_llama2_chat(model_path: str):
     return get_chat_template("llama-2-chat")
 
 
+@register_chat_template_matching_function
+def match_llama3_instruct(model_path: str):
+    model_path = model_path.lower()
+    if "llama-3" in model_path and "instruct" in model_path:
+        return get_chat_template("llama-3-instruct")
+
+
 @register_chat_template_matching_function
 def match_chat_ml(model_path: str):
     model_path = model_path.lower()
@@ -239,6 +306,13 @@ def match_gemma_it(model_path: str):
     return get_chat_template("gemma-it")
 
 
+@register_chat_template_matching_function
+def match_c4ai_command_r(model_path: str):
+    model_path = model_path.lower()
+    if "c4ai-command-r" in model_path:
+        return get_chat_template("c4ai-command-r")
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default
sglang/lang/interpreter.py CHANGED
@@ -1,6 +1,7 @@
 """The interpreter that executes SGL programs"""
 
 import asyncio
+import contextvars
 import multiprocessing
 import queue
 import threading
@@ -10,6 +11,7 @@ from contextlib import contextmanager
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import tqdm
+
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglCommitLazy,
@@ -217,7 +219,13 @@ class StreamExecutor:
         self.use_thread = use_thread
         if self.use_thread:
             self.queue = queue.Queue()
-            self.worker = threading.Thread(target=self._thread_worker_func)
+
+            def _run_worker_in_context():
+                self._thread_worker_func()
+
+            self.worker = threading.Thread(
+                target=contextvars.copy_context().run, args=(_run_worker_in_context,)
+            )
             self.worker.start()
 
         # For streaming
@@ -248,17 +256,24 @@ class StreamExecutor:
     def set_var(self, name, value):
        self.variables[name] = value
 
-    def get_meta_info(self, name):
+    def get_meta_info(self, name, timeout=None):
         if name in self.variable_event:
-            self.variable_event[name].wait()
+            got = self.variable_event[name].wait(timeout)
+            if not got:
+                raise TimeoutError(f"Timeout while waiting for event '{name}'")
         ret = self.meta_info.get(name, None)
         return ret
 
-    def fork(self, number: int, position_ids_offset: Optional[List[int]] = None):
-        self.submit(SglCommitLazy())
-        self.sync()
+    def fork(
+        self,
+        size: int = 1,
+        position_ids_offset: Optional[List[int]] = None,
+    ):
+        if size > 1:
+            self.submit(SglCommitLazy())
 
-        number = int(number)
+        self.sync()
+        size = int(size)
 
         exes = [
             StreamExecutor(
@@ -268,14 +283,15 @@ class StreamExecutor:
                 self.chat_template,
                 self.stream,
             )
-            for _ in range(number)
+            for _ in range(size)
         ]
-        for i in range(number):
+        for i in range(size):
             exes[i].variables = dict(self.variables)
             exes[i].text_ = str(self.text_)
             exes[i].messages_ = list(self.messages_)
             exes[i].cur_role = self.cur_role
             exes[i].fork_start_text_pos = len(self.text_)
+            exes[i].images_ = list(self.images_)
 
         return exes
 
@@ -454,15 +470,19 @@ class StreamExecutor:
             self.stream_var_event[name].set()
 
     def _execute_select(self, expr: SglSelect):
-        decision, normalized_prompt_logprob, prompt_logprob = self.backend.select(
-            self, expr.choices, expr.temperature
-        )
+        (
+            decision,
+            normalized_prompt_logprobs,
+            prefill_token_logprobs,
+            decode_token_logprobs,
+        ) = self.backend.select(self, expr.choices, expr.temperature)
         if expr.name is not None:
             name = expr.name
             self.variables[name] = decision
             self.meta_info[name] = {
-                "normalized_prompt_logprob": normalized_prompt_logprob,
-                "prompt_logprob": prompt_logprob,
+                "normalized_prompt_logprobs": normalized_prompt_logprobs,
+                "prefill_token_logprobs": prefill_token_logprobs,
+                "decode_token_logprobs": decode_token_logprobs,
             }
             self.variable_event[name].set()
             self.text_ += decision
@@ -634,8 +654,12 @@ class ProgramState:
             yield
             self.stream_executor.submit(SglVarScopeEnd(name))
 
-    def fork(self, number: int = 1, position_ids_offset: Optional[List[int]] = None):
-        stream_executors = self.stream_executor.fork(number, position_ids_offset)
+    def fork(
+        self,
+        size: int = 1,
+        position_ids_offset: Optional[List[int]] = None,
+    ):
+        stream_executors = self.stream_executor.fork(size, position_ids_offset)
         states = [ProgramState(x) for x in stream_executors]
         state_group = ProgramStateGroup(states, self)
         return state_group
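Taken together: `fork()` now takes `size` (formerly `number`), forked executors also copy `images_`, and the meta info recorded by `select` uses the new plural key names. A hedged usage sketch; the program body is illustrative and assumes a default backend has already been set:

import sglang as sgl

@sgl.function
def pick_and_branch(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.select("verdict", choices=["yes", "no"]))
    # fork() now takes `size` instead of `number`; each fork inherits images_ too.
    forks = s.fork(size=2)
    for i, f in enumerate(forks):
        f += sgl.assistant(sgl.gen(f"reason_{i}", max_tokens=32))

state = pick_and_branch.run(question="Is 17 prime?")
# Meta info keys were renamed from "normalized_prompt_logprob" / "prompt_logprob":
info = state.get_meta_info("verdict")
print(info["normalized_prompt_logprobs"], info["prefill_token_logprobs"])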
sglang/lang/tracer.py CHANGED
@@ -109,19 +109,21 @@ class TracerProgramState(ProgramState):
     ########### Public API ###########
     ##################################
 
-    def fork(self, number: int, position_ids_offset: Optional[List[int]] = None):
+    def fork(self, size: int = 1, position_ids_offset: Optional[List[int]] = None):
+        assert (size >= 1)
+
         if self.only_trace_prefix:
             raise StopTracing()
 
-        fork_node = SglFork(number)
+        fork_node = SglFork(size)
         fork_node.prev_node = self.last_node
 
         states = [
             TracerProgramState(self.backend, self.arguments, self.only_trace_prefix)
-            for _ in range(number)
+            for _ in range(size)
         ]
 
-        for i in range(number):
+        for i in range(size):
             node = SglGetForkItem(i)
             node.prev_node = fork_node
             states[i].last_node = node
sglang/launch_server.py CHANGED
@@ -2,10 +2,11 @@ import argparse
 
 from sglang.srt.server import ServerArgs, launch_server
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     args = parser.parse_args()
     server_args = ServerArgs.from_cli_args(args)
 
-    launch_server(server_args, None)
+    launch_server(server_args, None)
sglang/srt/constrained/fsm_cache.py CHANGED
@@ -7,6 +7,7 @@ class FSMCache(BaseCache):
         super().__init__(enable=enable)
 
         from importlib.metadata import version
+
         if version("outlines") >= "0.0.35":
             from transformers import AutoTokenizer
 
sglang/srt/constrained/jump_forward.py CHANGED
@@ -1,4 +1,5 @@
 import interegular
+
 from sglang.srt.constrained import FSMInfo, disk_cache, make_deterministic_fsm
 from sglang.srt.constrained.base_cache import BaseCache
 
sglang/srt/conversation.py CHANGED
@@ -4,7 +4,7 @@ import dataclasses
 from enum import IntEnum, auto
 from typing import Dict, List, Optional, Tuple, Union
 
-from sglang.srt.managers.openai_protocol import ChatCompletionRequest
+from sglang.srt.openai_protocol import ChatCompletionRequest
 
 
 class SeparatorStyle(IntEnum):
@@ -400,7 +400,7 @@ register_conv_template(
     Conversation(
         name="chatml",
         system_template="<|im_start|>system\n{system_message}",
-        system_message="You are an AI assistant.",
+        system_message="You are a helpful assistant.",
         roles=("<|im_start|>user", "<|im_start|>assistant"),
         sep_style=SeparatorStyle.CHATML,
         sep="<|im_end|>",
sglang/srt/hf_transformers_utils.py CHANGED
@@ -6,7 +6,6 @@ import warnings
 from typing import List, Optional, Tuple, Union
 
 from huggingface_hub import snapshot_download
-from sglang.srt.utils import is_multimodal_model
 from transformers import (
     AutoConfig,
     AutoProcessor,
@@ -15,6 +14,8 @@ from transformers import (
     PreTrainedTokenizerFast,
 )
 
+from sglang.srt.utils import is_multimodal_model
+
 
 def download_from_hf(model_path: str):
     if os.path.exists(model_path):
sglang/srt/layers/context_flashattention_nopad.py CHANGED
@@ -3,6 +3,7 @@
 import torch
 import triton
 import triton.language as tl
+
 from sglang.srt.utils import wrap_kernel_launcher
 
 CUDA_CAPABILITY = torch.cuda.get_device_capability()
sglang/srt/layers/extend_attention.py CHANGED
@@ -1,6 +1,7 @@
 import torch
 import triton
 import triton.language as tl
+
 from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
 from sglang.srt.utils import wrap_kernel_launcher
 