sglang 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +55 -2
- sglang/api.py +3 -5
- sglang/backend/anthropic.py +18 -4
- sglang/backend/openai.py +2 -1
- sglang/backend/runtime_endpoint.py +18 -5
- sglang/backend/vertexai.py +1 -0
- sglang/global_config.py +1 -0
- sglang/lang/chat_template.py +74 -0
- sglang/lang/interpreter.py +40 -16
- sglang/lang/tracer.py +6 -4
- sglang/launch_server.py +2 -1
- sglang/srt/constrained/fsm_cache.py +1 -0
- sglang/srt/constrained/jump_forward.py +1 -0
- sglang/srt/conversation.py +2 -2
- sglang/srt/hf_transformers_utils.py +2 -1
- sglang/srt/layers/context_flashattention_nopad.py +1 -0
- sglang/srt/layers/extend_attention.py +1 -0
- sglang/srt/layers/logits_processor.py +114 -54
- sglang/srt/layers/radix_attention.py +2 -1
- sglang/srt/layers/token_attention.py +1 -0
- sglang/srt/managers/detokenizer_manager.py +5 -1
- sglang/srt/managers/io_struct.py +12 -0
- sglang/srt/managers/router/infer_batch.py +70 -33
- sglang/srt/managers/router/manager.py +7 -2
- sglang/srt/managers/router/model_rpc.py +116 -73
- sglang/srt/managers/router/model_runner.py +111 -167
- sglang/srt/managers/router/radix_cache.py +46 -38
- sglang/srt/managers/tokenizer_manager.py +56 -11
- sglang/srt/memory_pool.py +5 -14
- sglang/srt/model_config.py +7 -0
- sglang/srt/models/commandr.py +376 -0
- sglang/srt/models/dbrx.py +413 -0
- sglang/srt/models/dbrx_config.py +281 -0
- sglang/srt/models/gemma.py +22 -20
- sglang/srt/models/llama2.py +23 -21
- sglang/srt/models/llava.py +12 -10
- sglang/srt/models/mixtral.py +27 -25
- sglang/srt/models/qwen.py +23 -21
- sglang/srt/models/qwen2.py +23 -21
- sglang/srt/models/stablelm.py +20 -21
- sglang/srt/models/yivl.py +6 -5
- sglang/srt/openai_api_adapter.py +356 -0
- sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +36 -20
- sglang/srt/sampling_params.py +2 -0
- sglang/srt/server.py +68 -447
- sglang/srt/server_args.py +76 -49
- sglang/srt/utils.py +88 -32
- sglang/srt/weight_utils.py +402 -0
- sglang/test/test_programs.py +8 -7
- sglang/test/test_utils.py +195 -7
- {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/METADATA +12 -14
- sglang-0.1.15.dist-info/RECORD +69 -0
- sglang-0.1.14.dist-info/RECORD +0 -64
- {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/LICENSE +0 -0
- {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/WHEEL +0 -0
- {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
```diff
@@ -1,4 +1,57 @@
-__version__ = "0.1.14"
+__version__ = "0.1.15"
 
-
+# SGL API Components
+from sglang.api import (
+    Runtime,
+    assistant,
+    assistant_begin,
+    assistant_end,
+    flush_cache,
+    function,
+    gen,
+    gen_int,
+    gen_string,
+    get_server_args,
+    image,
+    select,
+    set_default_backend,
+    system,
+    user,
+    user_begin,
+    user_end,
+)
+
+# SGL Backends
+from sglang.backend.anthropic import Anthropic
+from sglang.backend.openai import OpenAI
+from sglang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.backend.vertexai import VertexAI
+
+# Global Configurations
 from sglang.global_config import global_config
+
+# public APIs management
+__all__ = [
+    "global_config",
+    "Anthropic",
+    "OpenAI",
+    "RuntimeEndpoint",
+    "VertexAI",
+    "function",
+    "Runtime",
+    "set_default_backend",
+    "flush_cache",
+    "get_server_args",
+    "gen",
+    "gen_int",
+    "gen_string",
+    "image",
+    "select",
+    "system",
+    "user",
+    "assistant",
+    "user_begin",
+    "user_end",
+    "assistant_begin",
+    "assistant_end",
+]
```
sglang/api.py
CHANGED
```diff
@@ -1,13 +1,10 @@
-"""Public API"""
+"""Some Public API Definitions"""
 
+import os
 import re
 from typing import Callable, List, Optional, Union
 
-from sglang.backend.anthropic import Anthropic
 from sglang.backend.base_backend import BaseBackend
-from sglang.backend.openai import OpenAI
-from sglang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.backend.vertexai import VertexAI
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglExpr,
@@ -35,6 +32,7 @@ def function(
 
 def Runtime(*args, **kwargs):
     # Avoid importing unnecessary dependency
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
    from sglang.srt.server import Runtime

    return Runtime(*args, **kwargs)
```
sglang/backend/anthropic.py
CHANGED
```diff
@@ -1,6 +1,7 @@
 from typing import List, Optional, Union
 
 import numpy as np
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.interpreter import StreamExecutor
@@ -13,7 +14,7 @@ except ImportError as e:
 
 
 class Anthropic(BaseBackend):
-    def __init__(self, model_name):
+    def __init__(self, model_name, *args, **kwargs):
         super().__init__()
 
         if isinstance(anthropic, Exception):
@@ -21,6 +22,7 @@ class Anthropic(BaseBackend):
 
         self.model_name = model_name
         self.chat_template = get_chat_template("claude")
+        self.client = anthropic.Anthropic(*args, **kwargs)
 
     def get_chat_template(self):
         return self.chat_template
@@ -35,8 +37,14 @@ class Anthropic(BaseBackend):
         else:
             messages = [{"role": "user", "content": s.text_}]
 
-
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        ret = self.client.messages.create(
             model=self.model_name,
+            system=system,
             messages=messages,
             **sampling_params.to_anthropic_kwargs(),
         )
@@ -54,10 +62,16 @@ class Anthropic(BaseBackend):
         else:
             messages = [{"role": "user", "content": s.text_}]
 
-
+        if messages and messages[0]["role"] == "system":
+            system = messages.pop(0)["content"]
+        else:
+            system = ""
+
+        with self.client.messages.stream(
             model=self.model_name,
+            system=system,
             messages=messages,
             **sampling_params.to_anthropic_kwargs(),
         ) as stream:
             for text in stream.text_stream:
-                yield text, {}
+                yield text, {}
```
sglang/backend/openai.py
CHANGED
```diff
@@ -3,6 +3,7 @@ import time
 from typing import Callable, List, Optional, Union
 
 import numpy as np
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path
 from sglang.lang.interpreter import StreamExecutor
@@ -227,7 +228,7 @@ class OpenAI(BaseBackend):
             prompt_tokens.append(ret_token)
 
         decision = choices[np.argmax(scores)]
-        return decision, scores,
+        return decision, scores, None, None
 
 
 def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
```
sglang/backend/runtime_endpoint.py
CHANGED
```diff
@@ -3,6 +3,7 @@ from typing import Callable, List, Optional, Union
 
 import numpy as np
 import requests
+
 from sglang.backend.base_backend import BaseBackend
 from sglang.global_config import global_config
 from sglang.lang.chat_template import get_chat_template_by_model_path
@@ -73,9 +74,11 @@ class RuntimeEndpoint(BaseBackend):
         assert res.status_code == 200
 
     def commit_lazy_operations(self, s: StreamExecutor):
+        data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
+        self._add_images(s, data)
         res = http_request(
             self.base_url + "/generate",
-            json={"text": s.text_, "sampling_params": {"max_new_tokens": 0}},
+            json=data,
             auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
@@ -104,6 +107,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 **sampling_params.to_srt_kwargs(),
             },
         }
@@ -112,6 +116,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 "dtype": "int",
                 **sampling_params.to_srt_kwargs(),
             },
@@ -142,6 +147,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 **sampling_params.to_srt_kwargs(),
             },
         }
@@ -150,6 +156,7 @@ class RuntimeEndpoint(BaseBackend):
             "text": s.text_,
             "sampling_params": {
                 "skip_special_tokens": global_config.skip_special_tokens_in_output,
+                "spaces_between_special_tokens": global_config.spaces_between_special_tokens_in_out,
                 "dtype": "int",
                 **sampling_params.to_srt_kwargs(),
             },
@@ -224,13 +231,19 @@ class RuntimeEndpoint(BaseBackend):
         )
         assert res.status_code == 200
         obj = res.json()
-
+        normalized_prompt_logprobs = [
             r["meta_info"]["normalized_prompt_logprob"] for r in obj
         ]
-
+        decision = choices[np.argmax(normalized_prompt_logprobs)]
+        prefill_token_logprobs = [r["meta_info"]["prefill_token_logprobs"] for r in obj]
+        decode_token_logprobs = [r["meta_info"]["decode_token_logprobs"] for r in obj]
 
-
-
+        return (
+            decision,
+            normalized_prompt_logprobs,
+            prefill_token_logprobs,
+            decode_token_logprobs,
+        )
 
     def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
         res = http_request(
```
sglang/backend/vertexai.py
CHANGED
sglang/global_config.py
CHANGED
sglang/lang/chat_template.py
CHANGED
```diff
@@ -162,6 +162,28 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="llama-3-instruct",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|start_header_id|>system<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "user": (
+                "<|start_header_id|>user<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+            "assistant": (
+                "<|start_header_id|>assistant<|end_header_id|>\n\n",
+                "<|eot_id|>",
+            ),
+        },
+        stop_str=("<|eot_id|>",),
+    )
+)
+
 
 # Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
 register_chat_template(
     ChatTemplate(
@@ -192,6 +214,44 @@ register_chat_template(
     )
 )
 
+register_chat_template(
+    ChatTemplate(
+        name="dbrx-instruct",
+        default_system_prompt="You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>"),
+            "user": ("\n<|im_start|>user\n", "<|im_end|>"),
+            "assistant": ("\n<|im_start|>assistant\n", "<|im_end|>"),
+        },
+        stop_str=("<|im_end|>",),
+    )
+)
+
+register_chat_template(
+    ChatTemplate(
+        name="c4ai-command-r",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
+                "<|END_OF_TURN_TOKEN|>",
+            ),
+            "user": ("<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|END_OF_TURN_TOKEN|>"),
+            "assistant": (
+                "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+                "<|END_OF_TURN_TOKEN|>",
+            ),
+        },
+        style=ChatTemplateStyle.PLAIN,
+    )
+)
+
+
+@register_chat_template_matching_function
+def match_dbrx(model_path: str):
+    if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
+        return get_chat_template("dbrx-instruct")
+
 
 @register_chat_template_matching_function
 def match_vicuna(model_path: str):
@@ -214,6 +274,13 @@ def match_llama2_chat(model_path: str):
     return get_chat_template("llama-2-chat")
 
 
+@register_chat_template_matching_function
+def match_llama3_instruct(model_path: str):
+    model_path = model_path.lower()
+    if "llama-3" in model_path and "instruct" in model_path:
+        return get_chat_template("llama-3-instruct")
+
+
 @register_chat_template_matching_function
 def match_chat_ml(model_path: str):
     model_path = model_path.lower()
@@ -239,6 +306,13 @@ def match_gemma_it(model_path: str):
     return get_chat_template("gemma-it")
 
 
+@register_chat_template_matching_function
+def match_c4ai_command_r(model_path: str):
+    model_path = model_path.lower()
+    if "c4ai-command-r" in model_path:
+        return get_chat_template("c4ai-command-r")
+
+
 if __name__ == "__main__":
     messages = [
         {"role": "system", "content": None},  # None means default
```
sglang/lang/interpreter.py
CHANGED
```diff
@@ -1,6 +1,7 @@
 """The interpreter that executes SGL programs"""
 
 import asyncio
+import contextvars
 import multiprocessing
 import queue
 import threading
@@ -10,6 +11,7 @@ from contextlib import contextmanager
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import tqdm
+
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglCommitLazy,
@@ -217,7 +219,13 @@ class StreamExecutor:
         self.use_thread = use_thread
         if self.use_thread:
             self.queue = queue.Queue()
-
+
+            def _run_worker_in_context():
+                self._thread_worker_func()
+
+            self.worker = threading.Thread(
+                target=contextvars.copy_context().run, args=(_run_worker_in_context,)
+            )
             self.worker.start()
 
         # For streaming
@@ -248,17 +256,24 @@ class StreamExecutor:
     def set_var(self, name, value):
         self.variables[name] = value
 
-    def get_meta_info(self, name):
+    def get_meta_info(self, name, timeout=None):
         if name in self.variable_event:
-            self.variable_event[name].wait()
+            got = self.variable_event[name].wait(timeout)
+            if not got:
+                raise TimeoutError(f"Timeout while waiting for event '{name}'")
         ret = self.meta_info.get(name, None)
         return ret
 
-    def fork(
-        self
-
+    def fork(
+        self,
+        size: int = 1,
+        position_ids_offset: Optional[List[int]] = None,
+    ):
+        if size > 1:
+            self.submit(SglCommitLazy())
 
-
+        self.sync()
+        size = int(size)
 
         exes = [
             StreamExecutor(
@@ -268,14 +283,15 @@ class StreamExecutor:
                 self.chat_template,
                 self.stream,
             )
-            for _ in range(
+            for _ in range(size)
         ]
-        for i in range(
+        for i in range(size):
             exes[i].variables = dict(self.variables)
             exes[i].text_ = str(self.text_)
             exes[i].messages_ = list(self.messages_)
             exes[i].cur_role = self.cur_role
             exes[i].fork_start_text_pos = len(self.text_)
+            exes[i].images_ = list(self.images_)
 
         return exes
 
@@ -454,15 +470,19 @@ class StreamExecutor:
             self.stream_var_event[name].set()
 
     def _execute_select(self, expr: SglSelect):
-
-
-
+        (
+            decision,
+            normalized_prompt_logprobs,
+            prefill_token_logprobs,
+            decode_token_logprobs,
+        ) = self.backend.select(self, expr.choices, expr.temperature)
         if expr.name is not None:
             name = expr.name
             self.variables[name] = decision
             self.meta_info[name] = {
-                "
-                "
+                "normalized_prompt_logprobs": normalized_prompt_logprobs,
+                "prefill_token_logprobs": prefill_token_logprobs,
+                "decode_token_logprobs": decode_token_logprobs,
             }
             self.variable_event[name].set()
         self.text_ += decision
@@ -634,8 +654,12 @@ class ProgramState:
         yield
         self.stream_executor.submit(SglVarScopeEnd(name))
 
-    def fork(
-
+    def fork(
+        self,
+        size: int = 1,
+        position_ids_offset: Optional[List[int]] = None,
+    ):
+        stream_executors = self.stream_executor.fork(size, position_ids_offset)
         states = [ProgramState(x) for x in stream_executors]
         state_group = ProgramStateGroup(states, self)
         return state_group
```
sglang/lang/tracer.py
CHANGED
```diff
@@ -109,19 +109,21 @@ class TracerProgramState(ProgramState):
     ########### Public API ###########
     ##################################
 
-    def fork(self,
+    def fork(self, size: int = 1, position_ids_offset: Optional[List[int]] = None):
+        assert (size >= 1)
+
         if self.only_trace_prefix:
             raise StopTracing()
 
-        fork_node = SglFork(
+        fork_node = SglFork(size)
         fork_node.prev_node = self.last_node
 
         states = [
             TracerProgramState(self.backend, self.arguments, self.only_trace_prefix)
-            for _ in range(
+            for _ in range(size)
         ]
 
-        for i in range(
+        for i in range(size):
             node = SglGetForkItem(i)
             node.prev_node = fork_node
             states[i].last_node = node
```
sglang/launch_server.py
CHANGED
```diff
@@ -2,10 +2,11 @@ import argparse
 
 from sglang.srt.server import ServerArgs, launch_server
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     args = parser.parse_args()
     server_args = ServerArgs.from_cli_args(args)
 
-    launch_server(server_args, None)
+    launch_server(server_args, None)
```
sglang/srt/conversation.py
CHANGED
```diff
@@ -4,7 +4,7 @@ import dataclasses
 from enum import IntEnum, auto
 from typing import Dict, List, Optional, Tuple, Union
 
-from sglang.srt.managers.openai_protocol import ChatCompletionRequest
+from sglang.srt.openai_protocol import ChatCompletionRequest
 
 
 class SeparatorStyle(IntEnum):
@@ -400,7 +400,7 @@ register_conv_template(
     Conversation(
         name="chatml",
         system_template="<|im_start|>system\n{system_message}",
-        system_message="You are
+        system_message="You are a helpful assistant.",
         roles=("<|im_start|>user", "<|im_start|>assistant"),
         sep_style=SeparatorStyle.CHATML,
         sep="<|im_end|>",
```
sglang/srt/hf_transformers_utils.py
CHANGED
```diff
@@ -6,7 +6,6 @@ import warnings
 from typing import List, Optional, Tuple, Union
 
 from huggingface_hub import snapshot_download
-from sglang.srt.utils import is_multimodal_model
 from transformers import (
     AutoConfig,
     AutoProcessor,
@@ -15,6 +14,8 @@ from transformers import (
     PreTrainedTokenizerFast,
 )
 
+from sglang.srt.utils import is_multimodal_model
+
 
 def download_from_hf(model_path: str):
     if os.path.exists(model_path):
```