sglang 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +5 -1
- sglang/api.py +8 -3
- sglang/backend/anthropic.py +1 -1
- sglang/backend/litellm.py +90 -0
- sglang/backend/openai.py +148 -12
- sglang/backend/runtime_endpoint.py +18 -10
- sglang/global_config.py +11 -1
- sglang/lang/chat_template.py +9 -2
- sglang/lang/interpreter.py +161 -81
- sglang/lang/ir.py +29 -11
- sglang/lang/tracer.py +1 -1
- sglang/launch_server.py +1 -2
- sglang/launch_server_llavavid.py +31 -0
- sglang/srt/constrained/fsm_cache.py +3 -0
- sglang/srt/flush_cache.py +16 -0
- sglang/srt/hf_transformers_utils.py +83 -2
- sglang/srt/layers/extend_attention.py +17 -0
- sglang/srt/layers/fused_moe.py +485 -0
- sglang/srt/layers/logits_processor.py +12 -7
- sglang/srt/layers/radix_attention.py +10 -3
- sglang/srt/layers/token_attention.py +16 -1
- sglang/srt/managers/controller/dp_worker.py +110 -0
- sglang/srt/managers/controller/infer_batch.py +619 -0
- sglang/srt/managers/controller/manager_multi.py +191 -0
- sglang/srt/managers/controller/manager_single.py +97 -0
- sglang/srt/managers/controller/model_runner.py +462 -0
- sglang/srt/managers/controller/radix_cache.py +267 -0
- sglang/srt/managers/controller/schedule_heuristic.py +59 -0
- sglang/srt/managers/controller/tp_worker.py +791 -0
- sglang/srt/managers/detokenizer_manager.py +45 -45
- sglang/srt/managers/io_struct.py +26 -10
- sglang/srt/managers/router/infer_batch.py +130 -74
- sglang/srt/managers/router/manager.py +7 -9
- sglang/srt/managers/router/model_rpc.py +224 -135
- sglang/srt/managers/router/model_runner.py +94 -107
- sglang/srt/managers/router/radix_cache.py +54 -18
- sglang/srt/managers/router/scheduler.py +23 -34
- sglang/srt/managers/tokenizer_manager.py +183 -88
- sglang/srt/model_config.py +5 -2
- sglang/srt/models/commandr.py +15 -22
- sglang/srt/models/dbrx.py +22 -29
- sglang/srt/models/gemma.py +14 -24
- sglang/srt/models/grok.py +671 -0
- sglang/srt/models/llama2.py +24 -23
- sglang/srt/models/llava.py +85 -25
- sglang/srt/models/llavavid.py +298 -0
- sglang/srt/models/mixtral.py +254 -130
- sglang/srt/models/mixtral_quant.py +373 -0
- sglang/srt/models/qwen.py +28 -25
- sglang/srt/models/qwen2.py +17 -22
- sglang/srt/models/stablelm.py +21 -26
- sglang/srt/models/yivl.py +17 -25
- sglang/srt/openai_api_adapter.py +140 -95
- sglang/srt/openai_protocol.py +10 -1
- sglang/srt/server.py +101 -52
- sglang/srt/server_args.py +59 -11
- sglang/srt/utils.py +242 -75
- sglang/test/test_programs.py +44 -0
- sglang/test/test_utils.py +32 -1
- sglang/utils.py +95 -26
- {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/METADATA +23 -13
- sglang-0.1.17.dist-info/RECORD +81 -0
- sglang/srt/backend_config.py +0 -13
- sglang/srt/models/dbrx_config.py +0 -281
- sglang/srt/weight_utils.py +0 -402
- sglang-0.1.15.dist-info/RECORD +0 -69
- {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/LICENSE +0 -0
- {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/WHEEL +0 -0
- {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-__version__ = "0.1.15"
+__version__ = "0.1.17"
 
 # SGL API Components
 from sglang.api import (
@@ -19,6 +19,7 @@ from sglang.api import (
     user,
     user_begin,
     user_end,
+    video,
 )
 
 # SGL Backends
@@ -26,6 +27,7 @@ from sglang.backend.anthropic import Anthropic
 from sglang.backend.openai import OpenAI
 from sglang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.backend.vertexai import VertexAI
+from sglang.backend.litellm import LiteLLM
 
 # Global Configurations
 from sglang.global_config import global_config
@@ -34,6 +36,7 @@ from sglang.global_config import global_config
 __all__ = [
     "global_config",
     "Anthropic",
+    "LiteLLM",
     "OpenAI",
     "RuntimeEndpoint",
     "VertexAI",
@@ -46,6 +49,7 @@ __all__ = [
     "gen_int",
     "gen_string",
     "image",
+    "video",
     "select",
     "system",
     "user",
sglang/api.py
CHANGED
@@ -15,17 +15,18 @@ from sglang.lang.ir import (
     SglRoleBegin,
     SglRoleEnd,
     SglSelect,
+    SglVideo,
 )
 
 
 def function(
-    func: Optional[Callable] = None,
+    func: Optional[Callable] = None, num_api_spec_tokens: Optional[int] = None
 ):
     if func:
-        return SglFunction(func,
+        return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)
 
     def decorator(func):
-        return SglFunction(func,
+        return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)
 
     return decorator
 
@@ -151,6 +152,10 @@ def image(expr: SglExpr):
     return SglImage(expr)
 
 
+def video(path: str, num_frames: int):
+    return SglVideo(path, num_frames)
+
+
 def select(
     name: Optional[str] = None,
     choices: List[str] = None,
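The api.py change adds two user-facing hooks: `sgl.video(path, num_frames)` for video inputs and a `num_api_spec_tokens` argument on `@sgl.function` that turns on API speculative execution (used by the OpenAI backend changes further down). A minimal usage sketch; the file path, frame count, and token budget are illustrative, not taken from the diff:

```python
import sglang as sgl

# Sketch only: sgl.video() follows the signature added above; the clip path,
# frame count, and serving setup are illustrative.
@sgl.function
def describe_clip(s, clip_path):
    s += sgl.user(sgl.video(clip_path, num_frames=16) + "Describe what happens in this clip.")
    s += sgl.assistant(sgl.gen("answer", max_tokens=128))

# Sketch only: num_api_spec_tokens enables API speculative execution for
# OpenAI-style chat backends (see the openai.py changes below).
@sgl.function(num_api_spec_tokens=64)
def qa(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))
```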
sglang/backend/litellm.py
ADDED
@@ -0,0 +1,90 @@
+from typing import Mapping, Optional
+
+from sglang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import get_chat_template_by_model_path
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+try:
+    import litellm
+except ImportError as e:
+    litellm = e
+litellm.num_retries = 1
+
+
+class LiteLLM(BaseBackend):
+
+    def __init__(
+        self,
+        model_name,
+        chat_template=None,
+        api_key=None,
+        organization: Optional[str] = None,
+        base_url: Optional[str] = None,
+        timeout: Optional[float] = 600,
+        max_retries: Optional[int] = litellm.num_retries,
+        default_headers: Optional[Mapping[str, str]] = None,
+    ):
+        super().__init__()
+
+        if isinstance(litellm, Exception):
+            raise litellm
+
+        self.model_name = model_name
+
+        self.chat_template = chat_template or get_chat_template_by_model_path(
+            model_name)
+
+        self.client_params = {
+            "api_key": api_key,
+            "organization": organization,
+            "base_url": base_url,
+            "timeout": timeout,
+            "max_retries": max_retries,
+            "default_headers": default_headers,
+        }
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        ret = litellm.completion(
+            model=self.model_name,
+            messages=messages,
+            **self.client_params,
+            **sampling_params.to_anthropic_kwargs(),
+        )
+        comp = ret.choices[0].message.content
+
+        return comp, {}
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        ret = litellm.completion(
+            model=self.model_name,
+            messages=messages,
+            stream=True,
+            **self.client_params,
+            **sampling_params.to_litellm_kwargs(),
+        )
+        for chunk in ret:
+            text = chunk.choices[0].delta.content
+            if text is not None:
+                yield text, {}
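The new LiteLLM backend wraps LiteLLM's unified completion API, so any provider LiteLLM can route to becomes usable from sglang programs. A minimal wiring sketch; the model name is illustrative and provider credentials are assumed to come from the environment:

```python
import sglang as sgl
from sglang.backend.litellm import LiteLLM

# Sketch only: "gpt-3.5-turbo" stands in for any model name LiteLLM can route;
# the constructor arguments mirror LiteLLM.__init__ shown above.
sgl.set_default_backend(LiteLLM("gpt-3.5-turbo", timeout=60))

@sgl.function
def greet(s):
    s += sgl.user("Say hello in one short sentence.")
    s += sgl.assistant(sgl.gen("reply", max_tokens=32))

state = greet.run()
print(state["reply"])
```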
sglang/backend/openai.py
CHANGED
@@ -1,5 +1,7 @@
 import logging
 import time
+import warnings
+import dataclasses
 from typing import Callable, List, Optional, Union
 
 import numpy as np
@@ -41,6 +43,15 @@ INSTRUCT_MODEL_NAMES = [
 ]
 
 
+@dataclasses.dataclass
+class TokenUsage:
+    prompt_tokens: int
+    completion_tokens: int
+
+    def reset(self):
+        self.prompt_tokens = self.completion_tokens = 0
+
+
 class OpenAI(BaseBackend):
     def __init__(
         self,
@@ -80,40 +91,89 @@ class OpenAI(BaseBackend):
         else:
             self.is_chat_model = True
 
-        self.
+        self.chat_prefix = self.chat_template.role_prefix_and_suffix["assistant"][0]
+
+        # Usage
+        self.token_usage = TokenUsage(0, 0)
+
+        # API speculative execution
+        # TODO(ying): This does not support multi-threading (run_batch)
+        self.spec_kwargs = {}
+        self.spec_format = []
+        self.spec_max_num_tries = 3
 
     def get_chat_template(self):
         return self.chat_template
 
+    def _prepare_spec_execution(self, sampling_params: SglSamplingParams,
+                                num_api_spec_tokens: int, spec_var_name: str):
+        if "max_tokens" not in self.spec_kwargs:
+            self.spec_kwargs["max_tokens"] = num_api_spec_tokens
+        else:
+            assert (
+                self.spec_kwargs["max_tokens"] == num_api_spec_tokens
+            )
+
+        params = sampling_params.to_openai_kwargs()
+        for key, value in params.items():
+            if key in ["stop"]:
+                continue
+            if key in ["max_tokens"]:
+                warnings.warn(
+                    "The parameter max_tokens will be overwritten by speculated number of tokens."
+                )
+                continue
+            if key not in self.spec_kwargs:
+                self.spec_kwargs[key] = value
+            else:
+                assert (
+                    value == self.spec_kwargs[key]
+                ), "sampling parameters should be consistent if turn on api speculative execution."
+        self.spec_format.append(
+            {"text": "", "stop": params["stop"], "name": spec_var_name}
+        )
+        return "", {}
+
     def generate(
         self,
         s: StreamExecutor,
         sampling_params: SglSamplingParams,
+        spec_var_name: str = None,
     ):
         if sampling_params.dtype is None:
             if self.is_chat_model:
-                if
-
-
-
-
-
+                if s.num_api_spec_tokens is None:
+                    if not s.text_.endswith(self.chat_prefix):
+                        raise RuntimeError(
+                            "This use case is not supported if api speculative execution is off. "
+                            "For OpenAI chat models, sgl.gen must be right after sgl.assistant. "
+                            "Example of adding api speculative execution: @function(num_api_spec_tokens=128)."
+                        )
+                    prompt = s.messages_
+                else:
+                    return self._prepare_spec_execution(sampling_params,
+                        s.num_api_spec_tokens, spec_var_name)
             else:
                 prompt = s.text_
 
             kwargs = sampling_params.to_openai_kwargs()
             comp = openai_completion(
                 client=self.client,
+                token_usage=self.token_usage,
                 is_chat=self.is_chat_model,
                 model=self.model_name,
                 prompt=prompt,
                 **kwargs,
             )
         elif sampling_params.dtype in [str, "str", "string"]:
+            assert (
+                not self.is_chat_model
+            ), "constrained type not supported on chat model"
             kwargs = sampling_params.to_openai_kwargs()
             kwargs.pop("stop")
             comp = openai_completion(
                 client=self.client,
+                token_usage=self.token_usage,
                 is_chat=self.is_chat_model,
                 model=self.model_name,
                 prompt=s.text_ + '"',
@@ -122,10 +182,14 @@ class OpenAI(BaseBackend):
             )
             comp = '"' + comp + '"'
         elif sampling_params.dtype in [int, "int"]:
+            assert (
+                not self.is_chat_model
+            ), "constrained type not supported on chat model"
             kwargs = sampling_params.to_openai_kwargs()
             kwargs.pop("stop")
             comp = openai_completion(
                 client=self.client,
+                token_usage=self.token_usage,
                 is_chat=self.is_chat_model,
                 model=self.model_name,
                 prompt=s.text_,
@@ -138,6 +202,63 @@ class OpenAI(BaseBackend):
 
         return comp, {}
 
+    def spec_fill(self, value: str):
+        assert self.is_chat_model
+        self.spec_format.append({"text": value, "stop": None, "name": None})
+
+    def spec_pattern_match(self, comp):
+        for i, term in enumerate(self.spec_format):
+            text = term["text"]
+            if text != "":
+                if comp.startswith(text):
+                    comp = comp[len(text) :]
+                else:
+                    return False
+            else:
+                pos = comp.find(term["stop"])
+                if pos != -1:
+                    term["text"] = comp[:pos]
+                    comp = comp[pos:]
+                else:
+                    if i == len(self.spec_format) - 1:
+                        term["text"] = comp
+                    else:
+                        return False
+        return True
+
+    def role_end_generate(
+        self,
+        s: StreamExecutor,
+    ):
+        if s.num_api_spec_tokens is None or not s.text_.endswith(self.chat_prefix):
+            return
+
+        comp = ""
+        if not all(x["name"] is None for x in self.spec_format):
+            # TODO(ying): throw errors or warnings
+            for i in range(self.spec_max_num_tries):
+                comp = openai_completion(
+                    client=self.client,
+                    token_usage=self.token_usage,
+                    is_chat=self.is_chat_model,
+                    model=self.model_name,
+                    prompt=s.messages_,
+                    **self.spec_kwargs,
+                )
+                if self.spec_pattern_match(comp):
+                    break
+
+        for term in self.spec_format:
+            s.text_ += term["text"]
+            name = term["name"]
+            if name is not None:
+                s.variables[name] = term["text"]
+                s.meta_info[name] = {}
+                s.variable_event[name].set()
+
+        self.spec_kwargs = {}
+        self.spec_format = []
+
     def generate_stream(
         self,
         s: StreamExecutor,
@@ -145,7 +266,7 @@ class OpenAI(BaseBackend):
     ):
         if sampling_params.dtype is None:
             if self.is_chat_model:
-                if not s.text_.endswith(self.
+                if not s.text_.endswith(self.chat_prefix):
                     raise RuntimeError(
                         "This use case is not supported. "
                         "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
@@ -157,6 +278,7 @@ class OpenAI(BaseBackend):
             kwargs = sampling_params.to_openai_kwargs()
             generator = openai_completion_stream(
                 client=self.client,
+                token_usage=self.token_usage,
                 is_chat=self.is_chat_model,
                 model=self.model_name,
                 prompt=prompt,
@@ -202,6 +324,8 @@ class OpenAI(BaseBackend):
         )
         ret_str = ret.choices[0].text
         ret_token = self.tokenizer.encode(ret_str)[0]
+        self.token_usage.prompt_tokens += ret.usage.prompt_tokens
+        self.token_usage.completion_tokens= ret.usage.completion_tokens
 
         # TODO:
         # 1. return logits as the scores
@@ -231,7 +355,7 @@ class OpenAI(BaseBackend):
         return decision, scores, None, None
 
 
-def openai_completion(client,
+def openai_completion(client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs):
     for attempt in range(retries):
         try:
             if is_chat:
@@ -245,6 +369,9 @@ def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
                 comp = [c.text for c in ret.choices]
             else:
                 comp = ret.choices[0].text
+
+            token_usage.prompt_tokens += ret.usage.prompt_tokens
+            token_usage.completion_tokens += ret.usage.completion_tokens
             break
         except (openai.APIError, openai.APIConnectionError, openai.RateLimitError) as e:
             logger.error(f"OpenAI Error: {e}. Waiting 5 seconds...")
@@ -258,16 +385,19 @@ def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
     return comp
 
 
-def openai_completion_stream(client,
+def openai_completion_stream(client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs):
     for attempt in range(retries):
         try:
             if is_chat:
                 if "stop" in kwargs and kwargs["stop"] is None:
                     kwargs.pop("stop")
                 generator = client.chat.completions.create(
-                    messages=prompt, stream=True,
+                    messages=prompt, stream=True, stream_options={"include_usage": True},
+                    **kwargs
                 )
                 for ret in generator:
+                    if len(ret.choices) == 0:
+                        continue
                     try:
                         content = ret.choices[0].delta.content
                     except IndexError:
@@ -275,11 +405,17 @@ def openai_completion_stream(client, retries=3, is_chat=None, prompt=None, **kwargs):
                         yield content or "", {}
             else:
                 generator = client.completions.create(
-                    prompt=prompt, stream=True,
+                    prompt=prompt, stream=True, stream_options={"include_usage": True},
+                    **kwargs
                 )
                 for ret in generator:
+                    if len(ret.choices) == 0:
+                        continue
                    content = ret.choices[0].text
                    yield content or "", {}
+
+            token_usage.prompt_tokens += ret.usage.prompt_tokens
+            token_usage.completion_tokens += ret.usage.completion_tokens
             break
         except (openai.APIError, openai.APIConnectionError, openai.RateLimitError) as e:
             logger.error(f"OpenAI Error: {e}. Waiting 5 seconds...")
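Two things stand out in the openai.py changes: every completion call now accumulates counts into a per-backend `TokenUsage`, and chat-model generation gains a speculative-execution path driven by `num_api_spec_tokens`. A sketch of reading the new counters; the model name and prompt are illustrative:

```python
import sglang as sgl
from sglang import OpenAI

backend = OpenAI("gpt-3.5-turbo")  # illustrative model name
sgl.set_default_backend(backend)

@sgl.function
def qa(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))

qa.run(question="What does speculative execution mean here?")

# token_usage is the TokenUsage dataclass added in this release.
print(backend.token_usage.prompt_tokens, backend.token_usage.completion_tokens)
backend.token_usage.reset()  # zero both counters
```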
sglang/backend/runtime_endpoint.py
CHANGED
@@ -34,7 +34,7 @@ class RuntimeEndpoint(BaseBackend):
             api_key=self.api_key,
             verify=self.verify,
         )
-
+        self._assert_success(res)
         self.model_info = res.json()
 
         self.chat_template = get_chat_template_by_model_path(
@@ -50,7 +50,7 @@ class RuntimeEndpoint(BaseBackend):
             auth_token=self.auth_token,
             verify=self.verify,
         )
-
+        self._assert_success(res)
 
     def get_server_args(self):
         res = http_request(
@@ -58,6 +58,7 @@ class RuntimeEndpoint(BaseBackend):
             auth_token=self.auth_token,
             verify=self.verify,
         )
+        self._assert_success(res)
         return res.json()
 
     def get_chat_template(self):
@@ -71,7 +72,7 @@ class RuntimeEndpoint(BaseBackend):
             api_key=self.api_key,
             verify=self.verify,
         )
-
+        self._assert_success(res)
 
     def commit_lazy_operations(self, s: StreamExecutor):
         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
@@ -83,7 +84,7 @@ class RuntimeEndpoint(BaseBackend):
             api_key=self.api_key,
             verify=self.verify,
         )
-
+        self._assert_success(res)
 
     def fill_image(self, s: StreamExecutor):
         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
@@ -95,7 +96,7 @@ class RuntimeEndpoint(BaseBackend):
             api_key=self.api_key,
             verify=self.verify,
         )
-
+        self._assert_success(res)
 
     def generate(
         self,
@@ -133,6 +134,8 @@ class RuntimeEndpoint(BaseBackend):
             api_key=self.api_key,
             verify=self.verify,
         )
+        self._assert_success(res)
+
         obj = res.json()
         comp = obj["text"]
         return comp, obj["meta_info"]
@@ -167,7 +170,7 @@ class RuntimeEndpoint(BaseBackend):
         data["stream"] = True
         self._add_images(s, data)
 
-
+        res = http_request(
             self.base_url + "/generate",
             json=data,
             stream=True,
@@ -175,10 +178,11 @@ class RuntimeEndpoint(BaseBackend):
             api_key=self.api_key,
             verify=self.verify,
         )
+        self._assert_success(res)
         pos = 0
 
         incomplete_text = ""
-        for chunk in
+        for chunk in res.iter_lines(decode_unicode=False):
             chunk = chunk.decode("utf-8")
             if chunk and chunk.startswith("data:"):
                 if chunk == "data: [DONE]":
@@ -211,7 +215,7 @@ class RuntimeEndpoint(BaseBackend):
             api_key=self.api_key,
             verify=self.verify,
         )
-
+        self._assert_success(res)
         prompt_len = res.json()["meta_info"]["prompt_tokens"]
 
         # Compute logprob
@@ -229,7 +233,7 @@ class RuntimeEndpoint(BaseBackend):
             api_key=self.api_key,
             verify=self.verify,
        )
-
+        self._assert_success(res)
         obj = res.json()
         normalized_prompt_logprobs = [
             r["meta_info"]["normalized_prompt_logprob"] for r in obj
@@ -253,9 +257,13 @@ class RuntimeEndpoint(BaseBackend):
             api_key=self.api_key,
             verify=self.verify,
         )
-
+        self._assert_success(res)
 
     def _add_images(self, s: StreamExecutor, data):
         if s.images_:
             assert len(s.images_) == 1, "Only support one image."
             data["image_data"] = s.images_[0][1]
+
+    def _assert_success(self, res):
+        if res.status_code != 200:
+            raise RuntimeError(res.json())
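The recurring runtime_endpoint.py change is one pattern: every HTTP call to the runtime is now followed by `self._assert_success(res)`, so a non-200 response raises immediately with the server's JSON error body instead of being parsed as if it had succeeded. The same check as a standalone sketch against a local server; the URL and payload are illustrative and assume an sglang server on the default local port:

```python
import requests

# Illustrative standalone version of RuntimeEndpoint._assert_success.
res = requests.post(
    "http://localhost:30000/generate",
    json={"text": "Hello", "sampling_params": {"max_new_tokens": 8}},
)
if res.status_code != 200:
    raise RuntimeError(res.json())  # surface the server's error payload
print(res.json()["text"])
```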
sglang/global_config.py
CHANGED
@@ -16,7 +16,7 @@ class GlobalConfig:
 
         # Optimization configs
         self.eager_fill_image = False
-        self.
+        self.enable_precache_with_tracing = True
         self.enable_parallel_encoding = True
         self.enable_parallel_decoding = True
 
@@ -25,5 +25,15 @@ class GlobalConfig:
         # adjust_cache: Adjust the position embedding of KV cache.
         self.concate_and_append_mode = "no_adjust"
 
+        # Request dependency time due to network delay
+        self.request_dependency_delay = 0.02
+        self.wait_for_new_request_delay = 0.0004
+
+        # New generation token ratio estimation
+        self.base_new_token_ratio = 0.4
+        self.base_min_new_token_ratio = 0.2
+        self.new_token_ratio_decay = 0.0001
+        self.new_token_ratio_recovery = 0.05
+
 
 global_config = GlobalConfig()
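The new global_config fields expose the scheduler's timing and token-ratio heuristics as plain attributes, so they can be adjusted in Python before launching a runtime. A sketch; the override values are illustrative, not recommendations:

```python
from sglang.global_config import global_config

# Illustrative overrides of the knobs added in this release.
global_config.request_dependency_delay = 0.05    # tolerate a slower network
global_config.wait_for_new_request_delay = 0.001
global_config.base_new_token_ratio = 0.3         # more conservative decode-headroom estimate
global_config.new_token_ratio_decay = 0.0002
```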
sglang/lang/chat_template.py
CHANGED
@@ -259,6 +259,8 @@ def match_vicuna(model_path: str):
         return get_chat_template("vicuna_v1.1")
     if "llava-v1.5" in model_path.lower():
         return get_chat_template("vicuna_v1.1")
+    if "llava-next-video-7b" in model_path.lower():
+        return get_chat_template("vicuna_v1.1")
 
 
 @register_chat_template_matching_function
@@ -283,19 +285,24 @@ def match_llama3_instruct(model_path: str):
 
 @register_chat_template_matching_function
 def match_chat_ml(model_path: str):
+    # import pdb;pdb.set_trace()
     model_path = model_path.lower()
     if "tinyllama" in model_path:
         return get_chat_template("chatml")
     if "qwen" in model_path and "chat" in model_path:
         return get_chat_template("chatml")
-    if
+    if (
+        "llava-v1.6-34b" in model_path
+        or "llava-v1.6-yi-34b" in model_path
+        or "llava-next-video-34b" in model_path
+    ):
         return get_chat_template("chatml-llava")
 
 
 @register_chat_template_matching_function
 def match_chat_yi(model_path: str):
     model_path = model_path.lower()
-    if "yi" in model_path:
+    if "yi" in model_path and "llava" not in model_path:
         return get_chat_template("yi")
 
 
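With the chat_template.py changes, LLaVA-NeXT-Video checkpoints resolve to existing templates (the 7B variant to vicuna_v1.1, the 34B variant to chatml-llava) while plain Yi models keep the yi template. A quick sketch of the expected matching behaviour; the Hugging Face-style model paths are illustrative:

```python
from sglang.lang.chat_template import get_chat_template_by_model_path

# Expected template names under the new matching rules; paths are illustrative.
print(get_chat_template_by_model_path("lmms-lab/LLaVA-NeXT-Video-7B").name)   # expected: vicuna_v1.1
print(get_chat_template_by_model_path("lmms-lab/LLaVA-NeXT-Video-34B").name)  # expected: chatml-llava
print(get_chat_template_by_model_path("01-ai/Yi-34B-Chat").name)              # expected: yi
```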