sglang 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. sglang/__init__.py +5 -1
  2. sglang/api.py +8 -3
  3. sglang/backend/anthropic.py +1 -1
  4. sglang/backend/litellm.py +90 -0
  5. sglang/backend/openai.py +148 -12
  6. sglang/backend/runtime_endpoint.py +18 -10
  7. sglang/global_config.py +11 -1
  8. sglang/lang/chat_template.py +9 -2
  9. sglang/lang/interpreter.py +161 -81
  10. sglang/lang/ir.py +29 -11
  11. sglang/lang/tracer.py +1 -1
  12. sglang/launch_server.py +1 -2
  13. sglang/launch_server_llavavid.py +31 -0
  14. sglang/srt/constrained/fsm_cache.py +3 -0
  15. sglang/srt/flush_cache.py +16 -0
  16. sglang/srt/hf_transformers_utils.py +83 -2
  17. sglang/srt/layers/extend_attention.py +17 -0
  18. sglang/srt/layers/fused_moe.py +485 -0
  19. sglang/srt/layers/logits_processor.py +12 -7
  20. sglang/srt/layers/radix_attention.py +10 -3
  21. sglang/srt/layers/token_attention.py +16 -1
  22. sglang/srt/managers/controller/dp_worker.py +110 -0
  23. sglang/srt/managers/controller/infer_batch.py +619 -0
  24. sglang/srt/managers/controller/manager_multi.py +191 -0
  25. sglang/srt/managers/controller/manager_single.py +97 -0
  26. sglang/srt/managers/controller/model_runner.py +462 -0
  27. sglang/srt/managers/controller/radix_cache.py +267 -0
  28. sglang/srt/managers/controller/schedule_heuristic.py +59 -0
  29. sglang/srt/managers/controller/tp_worker.py +791 -0
  30. sglang/srt/managers/detokenizer_manager.py +45 -45
  31. sglang/srt/managers/io_struct.py +26 -10
  32. sglang/srt/managers/router/infer_batch.py +130 -74
  33. sglang/srt/managers/router/manager.py +7 -9
  34. sglang/srt/managers/router/model_rpc.py +224 -135
  35. sglang/srt/managers/router/model_runner.py +94 -107
  36. sglang/srt/managers/router/radix_cache.py +54 -18
  37. sglang/srt/managers/router/scheduler.py +23 -34
  38. sglang/srt/managers/tokenizer_manager.py +183 -88
  39. sglang/srt/model_config.py +5 -2
  40. sglang/srt/models/commandr.py +15 -22
  41. sglang/srt/models/dbrx.py +22 -29
  42. sglang/srt/models/gemma.py +14 -24
  43. sglang/srt/models/grok.py +671 -0
  44. sglang/srt/models/llama2.py +24 -23
  45. sglang/srt/models/llava.py +85 -25
  46. sglang/srt/models/llavavid.py +298 -0
  47. sglang/srt/models/mixtral.py +254 -130
  48. sglang/srt/models/mixtral_quant.py +373 -0
  49. sglang/srt/models/qwen.py +28 -25
  50. sglang/srt/models/qwen2.py +17 -22
  51. sglang/srt/models/stablelm.py +21 -26
  52. sglang/srt/models/yivl.py +17 -25
  53. sglang/srt/openai_api_adapter.py +140 -95
  54. sglang/srt/openai_protocol.py +10 -1
  55. sglang/srt/server.py +101 -52
  56. sglang/srt/server_args.py +59 -11
  57. sglang/srt/utils.py +242 -75
  58. sglang/test/test_programs.py +44 -0
  59. sglang/test/test_utils.py +32 -1
  60. sglang/utils.py +95 -26
  61. {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/METADATA +23 -13
  62. sglang-0.1.17.dist-info/RECORD +81 -0
  63. sglang/srt/backend_config.py +0 -13
  64. sglang/srt/models/dbrx_config.py +0 -281
  65. sglang/srt/weight_utils.py +0 -402
  66. sglang-0.1.15.dist-info/RECORD +0 -69
  67. {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/LICENSE +0 -0
  68. {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/WHEEL +0 -0
  69. {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/top_level.txt +0 -0
sglang/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.1.15"
+ __version__ = "0.1.17"
  
  # SGL API Components
  from sglang.api import (
@@ -19,6 +19,7 @@ from sglang.api import (
      user,
      user_begin,
      user_end,
+     video,
  )
  
  # SGL Backends
@@ -26,6 +27,7 @@ from sglang.backend.anthropic import Anthropic
  from sglang.backend.openai import OpenAI
  from sglang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.backend.vertexai import VertexAI
+ from sglang.backend.litellm import LiteLLM
  
  # Global Configurations
  from sglang.global_config import global_config
@@ -34,6 +36,7 @@ from sglang.global_config import global_config
  __all__ = [
      "global_config",
      "Anthropic",
+     "LiteLLM",
      "OpenAI",
      "RuntimeEndpoint",
      "VertexAI",
@@ -46,6 +49,7 @@ __all__ = [
      "gen_int",
      "gen_string",
      "image",
+     "video",
      "select",
      "system",
      "user",
sglang/api.py CHANGED
@@ -15,17 +15,18 @@ from sglang.lang.ir import (
      SglRoleBegin,
      SglRoleEnd,
      SglSelect,
+     SglVideo,
  )
  
  
  def function(
-     func: Optional[Callable] = None, api_num_spec_tokens: Optional[int] = None
+     func: Optional[Callable] = None, num_api_spec_tokens: Optional[int] = None
  ):
      if func:
-         return SglFunction(func, api_num_spec_tokens=api_num_spec_tokens)
+         return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)
  
      def decorator(func):
-         return SglFunction(func, api_num_spec_tokens=api_num_spec_tokens)
+         return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)
  
      return decorator
  
@@ -151,6 +152,10 @@ def image(expr: SglExpr):
      return SglImage(expr)
  
  
+ def video(path: str, num_frames: int):
+     return SglVideo(path, num_frames)
+ 
+ 
  def select(
      name: Optional[str] = None,
      choices: List[str] = None,
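The new sgl.video primitive mirrors sgl.image. A minimal sketch of how it might be called (the clip path, frame count, and prompt below are placeholders, not values from this diff):

    import sglang as sgl

    @sgl.function
    def describe_clip(s, clip_path, question):
        # video() attaches a clip and a number of sampled frames to the prompt,
        # analogous to image(); assumes a video-capable model such as LLaVA-NeXT-Video.
        s += sgl.user(sgl.video(clip_path, num_frames=16) + question)
        s += sgl.assistant(sgl.gen("answer", max_tokens=256))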
sglang/backend/anthropic.py CHANGED
@@ -74,4 +74,4 @@ class Anthropic(BaseBackend):
              **sampling_params.to_anthropic_kwargs(),
          ) as stream:
              for text in stream.text_stream:
-                 yield text, {}
+                 yield text, {}
sglang/backend/litellm.py ADDED
@@ -0,0 +1,90 @@
+ from typing import Mapping, Optional
+ 
+ from sglang.backend.base_backend import BaseBackend
+ from sglang.lang.chat_template import get_chat_template_by_model_path
+ from sglang.lang.interpreter import StreamExecutor
+ from sglang.lang.ir import SglSamplingParams
+ 
+ try:
+     import litellm
+ except ImportError as e:
+     litellm = e
+ litellm.num_retries = 1
+ 
+ 
+ class LiteLLM(BaseBackend):
+ 
+     def __init__(
+         self,
+         model_name,
+         chat_template=None,
+         api_key=None,
+         organization: Optional[str] = None,
+         base_url: Optional[str] = None,
+         timeout: Optional[float] = 600,
+         max_retries: Optional[int] = litellm.num_retries,
+         default_headers: Optional[Mapping[str, str]] = None,
+     ):
+         super().__init__()
+ 
+         if isinstance(litellm, Exception):
+             raise litellm
+ 
+         self.model_name = model_name
+ 
+         self.chat_template = chat_template or get_chat_template_by_model_path(
+             model_name)
+ 
+         self.client_params = {
+             "api_key": api_key,
+             "organization": organization,
+             "base_url": base_url,
+             "timeout": timeout,
+             "max_retries": max_retries,
+             "default_headers": default_headers,
+         }
+ 
+     def get_chat_template(self):
+         return self.chat_template
+ 
+     def generate(
+         self,
+         s: StreamExecutor,
+         sampling_params: SglSamplingParams,
+     ):
+         if s.messages_:
+             messages = s.messages_
+         else:
+             messages = [{"role": "user", "content": s.text_}]
+ 
+         ret = litellm.completion(
+             model=self.model_name,
+             messages=messages,
+             **self.client_params,
+             **sampling_params.to_anthropic_kwargs(),
+         )
+         comp = ret.choices[0].message.content
+ 
+         return comp, {}
+ 
+     def generate_stream(
+         self,
+         s: StreamExecutor,
+         sampling_params: SglSamplingParams,
+     ):
+         if s.messages_:
+             messages = s.messages_
+         else:
+             messages = [{"role": "user", "content": s.text_}]
+ 
+         ret = litellm.completion(
+             model=self.model_name,
+             messages=messages,
+             stream=True,
+             **self.client_params,
+             **sampling_params.to_litellm_kwargs(),
+         )
+         for chunk in ret:
+             text = chunk.choices[0].delta.content
+             if text is not None:
+                 yield text, {}
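A hedged sketch of wiring in the new LiteLLM backend; the model name is a placeholder and the matching provider API key is assumed to be set in the environment:

    import sglang as sgl
    from sglang import LiteLLM

    # LiteLLM routes the call to whichever provider owns the given model name.
    sgl.set_default_backend(LiteLLM("gpt-3.5-turbo"))

    @sgl.function
    def qa(s, question):
        s += sgl.user(question)
        s += sgl.assistant(sgl.gen("answer", max_tokens=128))

    state = qa.run(question="What is the capital of France?")
    print(state["answer"])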
sglang/backend/openai.py CHANGED
@@ -1,5 +1,7 @@
  import logging
  import time
+ import warnings
+ import dataclasses
  from typing import Callable, List, Optional, Union
  
  import numpy as np
@@ -41,6 +43,15 @@ INSTRUCT_MODEL_NAMES = [
  ]
  
  
+ @dataclasses.dataclass
+ class TokenUsage:
+     prompt_tokens: int
+     completion_tokens: int
+ 
+     def reset(self):
+         self.prompt_tokens = self.completion_tokens = 0
+ 
+ 
  class OpenAI(BaseBackend):
      def __init__(
          self,
@@ -80,40 +91,89 @@ class OpenAI(BaseBackend):
          else:
              self.is_chat_model = True
  
-         self.chat_begin_str = self.chat_template.role_prefix_and_suffix["assistant"][0]
+         self.chat_prefix = self.chat_template.role_prefix_and_suffix["assistant"][0]
+ 
+         # Usage
+         self.token_usage = TokenUsage(0, 0)
+ 
+         # API speculative execution
+         # TODO(ying): This does not support multi-threading (run_batch)
+         self.spec_kwargs = {}
+         self.spec_format = []
+         self.spec_max_num_tries = 3
  
      def get_chat_template(self):
          return self.chat_template
  
+     def _prepare_spec_execution(self, sampling_params: SglSamplingParams,
+         num_api_spec_tokens: int, spec_var_name: str):
+         if "max_tokens" not in self.spec_kwargs:
+             self.spec_kwargs["max_tokens"] = num_api_spec_tokens
+         else:
+             assert (
+                 self.spec_kwargs["max_tokens"] == num_api_spec_tokens
+             )
+ 
+         params = sampling_params.to_openai_kwargs()
+         for key, value in params.items():
+             if key in ["stop"]:
+                 continue
+             if key in ["max_tokens"]:
+                 warnings.warn(
+                     "The parameter max_tokens will be overwritten by speculated number of tokens."
+                 )
+                 continue
+             if key not in self.spec_kwargs:
+                 self.spec_kwargs[key] = value
+             else:
+                 assert (
+                     value == self.spec_kwargs[key]
+                 ), "sampling parameters should be consistent if turn on api speculative execution."
+         self.spec_format.append(
+             {"text": "", "stop": params["stop"], "name": spec_var_name}
+         )
+         return "", {}
+ 
      def generate(
          self,
          s: StreamExecutor,
          sampling_params: SglSamplingParams,
+         spec_var_name: str = None,
      ):
          if sampling_params.dtype is None:
              if self.is_chat_model:
-                 if not s.text_.endswith(self.chat_begin_str):
-                     raise RuntimeError(
-                         "This use case is not supported. "
-                         "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
-                     )
-                 prompt = s.messages_
+                 if s.num_api_spec_tokens is None:
+                     if not s.text_.endswith(self.chat_prefix):
+                         raise RuntimeError(
+                             "This use case is not supported if api speculative execution is off. "
+                             "For OpenAI chat models, sgl.gen must be right after sgl.assistant. "
+                             "Example of adding api speculative execution: @function(num_api_spec_tokens=128)."
+                         )
+                     prompt = s.messages_
+                 else:
+                     return self._prepare_spec_execution(sampling_params,
+                         s.num_api_spec_tokens, spec_var_name)
              else:
                  prompt = s.text_
  
              kwargs = sampling_params.to_openai_kwargs()
              comp = openai_completion(
                  client=self.client,
+                 token_usage=self.token_usage,
                  is_chat=self.is_chat_model,
                  model=self.model_name,
                  prompt=prompt,
                  **kwargs,
              )
          elif sampling_params.dtype in [str, "str", "string"]:
+             assert (
+                 not self.is_chat_model
+             ), "constrained type not supported on chat model"
              kwargs = sampling_params.to_openai_kwargs()
              kwargs.pop("stop")
              comp = openai_completion(
                  client=self.client,
+                 token_usage=self.token_usage,
                  is_chat=self.is_chat_model,
                  model=self.model_name,
                  prompt=s.text_ + '"',
@@ -122,10 +182,14 @@ class OpenAI(BaseBackend):
              )
              comp = '"' + comp + '"'
          elif sampling_params.dtype in [int, "int"]:
+             assert (
+                 not self.is_chat_model
+             ), "constrained type not supported on chat model"
              kwargs = sampling_params.to_openai_kwargs()
              kwargs.pop("stop")
              comp = openai_completion(
                  client=self.client,
+                 token_usage=self.token_usage,
                  is_chat=self.is_chat_model,
                  model=self.model_name,
                  prompt=s.text_,
@@ -138,6 +202,63 @@ class OpenAI(BaseBackend):
  
          return comp, {}
  
+     def spec_fill(self, value: str):
+         assert self.is_chat_model
+         self.spec_format.append({"text": value, "stop": None, "name": None})
+ 
+     def spec_pattern_match(self, comp):
+         for i, term in enumerate(self.spec_format):
+             text = term["text"]
+             if text != "":
+                 if comp.startswith(text):
+                     comp = comp[len(text) :]
+                 else:
+                     return False
+             else:
+                 pos = comp.find(term["stop"])
+                 if pos != -1:
+                     term["text"] = comp[:pos]
+                     comp = comp[pos:]
+                 else:
+                     if i == len(self.spec_format) - 1:
+                         term["text"] = comp
+                     else:
+                         return False
+         return True
+ 
+     def role_end_generate(
+         self,
+         s: StreamExecutor,
+     ):
+         if s.num_api_spec_tokens is None or not s.text_.endswith(self.chat_prefix):
+             return
+ 
+         comp = ""
+         if not all(x["name"] is None for x in self.spec_format):
+             # TODO(ying): throw errors or warnings
+             for i in range(self.spec_max_num_tries):
+                 comp = openai_completion(
+                     client=self.client,
+                     token_usage=self.token_usage,
+                     is_chat=self.is_chat_model,
+                     model=self.model_name,
+                     prompt=s.messages_,
+                     **self.spec_kwargs,
+                 )
+                 if self.spec_pattern_match(comp):
+                     break
+ 
+         for term in self.spec_format:
+             s.text_ += term["text"]
+             name = term["name"]
+             if name is not None:
+                 s.variables[name] = term["text"]
+                 s.meta_info[name] = {}
+                 s.variable_event[name].set()
+ 
+         self.spec_kwargs = {}
+         self.spec_format = []
+ 
      def generate_stream(
          self,
          s: StreamExecutor,
@@ -145,7 +266,7 @@ class OpenAI(BaseBackend):
      ):
          if sampling_params.dtype is None:
              if self.is_chat_model:
-                 if not s.text_.endswith(self.chat_begin_str):
+                 if not s.text_.endswith(self.chat_prefix):
                      raise RuntimeError(
                          "This use case is not supported. "
                          "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
@@ -157,6 +278,7 @@ class OpenAI(BaseBackend):
          kwargs = sampling_params.to_openai_kwargs()
          generator = openai_completion_stream(
              client=self.client,
+             token_usage=self.token_usage,
              is_chat=self.is_chat_model,
              model=self.model_name,
              prompt=prompt,
@@ -202,6 +324,8 @@ class OpenAI(BaseBackend):
          )
          ret_str = ret.choices[0].text
          ret_token = self.tokenizer.encode(ret_str)[0]
+         self.token_usage.prompt_tokens += ret.usage.prompt_tokens
+         self.token_usage.completion_tokens= ret.usage.completion_tokens
  
          # TODO:
          # 1. return logits as the scores
@@ -231,7 +355,7 @@ class OpenAI(BaseBackend):
      return decision, scores, None, None
  
  
- def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
+ def openai_completion(client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs):
      for attempt in range(retries):
          try:
              if is_chat:
@@ -245,6 +369,9 @@ def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
                  comp = [c.text for c in ret.choices]
              else:
                  comp = ret.choices[0].text
+ 
+             token_usage.prompt_tokens += ret.usage.prompt_tokens
+             token_usage.completion_tokens += ret.usage.completion_tokens
              break
          except (openai.APIError, openai.APIConnectionError, openai.RateLimitError) as e:
              logger.error(f"OpenAI Error: {e}. Waiting 5 seconds...")
@@ -258,16 +385,19 @@ def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
      return comp
  
  
- def openai_completion_stream(client, retries=3, is_chat=None, prompt=None, **kwargs):
+ def openai_completion_stream(client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs):
      for attempt in range(retries):
          try:
              if is_chat:
                  if "stop" in kwargs and kwargs["stop"] is None:
                      kwargs.pop("stop")
                  generator = client.chat.completions.create(
-                     messages=prompt, stream=True, **kwargs
+                     messages=prompt, stream=True, stream_options={"include_usage": True},
+                     **kwargs
                  )
                  for ret in generator:
+                     if len(ret.choices) == 0:
+                         continue
                      try:
                          content = ret.choices[0].delta.content
                      except IndexError:
@@ -275,11 +405,17 @@ def openai_completion_stream(client, retries=3, is_chat=None, prompt=None, **kwargs):
                      yield content or "", {}
              else:
                  generator = client.completions.create(
-                     prompt=prompt, stream=True, **kwargs
+                     prompt=prompt, stream=True, stream_options={"include_usage": True},
+                     **kwargs
                  )
                  for ret in generator:
+                     if len(ret.choices) == 0:
+                         continue
                      content = ret.choices[0].text
                      yield content or "", {}
+ 
+             token_usage.prompt_tokens += ret.usage.prompt_tokens
+             token_usage.completion_tokens += ret.usage.completion_tokens
              break
          except (openai.APIError, openai.APIConnectionError, openai.RateLimitError) as e:
              logger.error(f"OpenAI Error: {e}. Waiting 5 seconds...")
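For context, a sketch of the API speculative execution these changes enable: with num_api_spec_tokens set on the function, the backend issues one speculative chat completion and pattern-matches the named gen() segments out of it. The prompt text and token budget below are illustrative only:

    import sglang as sgl

    @sgl.function(num_api_spec_tokens=256)
    def character_spec(s, name):
        s += sgl.system("You are a helpful assistant.")
        s += sgl.user("Write a short profile for " + name + ".")
        # Both gen() calls are served from a single speculative completion,
        # then split apart by spec_pattern_match() using the stop strings.
        s += sgl.assistant(
            "Job: " + sgl.gen("job", stop="\n") + "\nHobby: " + sgl.gen("hobby", stop="\n")
        )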
sglang/backend/runtime_endpoint.py CHANGED
@@ -34,7 +34,7 @@ class RuntimeEndpoint(BaseBackend):
              api_key=self.api_key,
              verify=self.verify,
          )
-         assert res.status_code == 200
+         self._assert_success(res)
          self.model_info = res.json()
  
          self.chat_template = get_chat_template_by_model_path(
@@ -50,7 +50,7 @@ class RuntimeEndpoint(BaseBackend):
              auth_token=self.auth_token,
              verify=self.verify,
          )
-         return res.status_code == 200
+         self._assert_success(res)
  
      def get_server_args(self):
          res = http_request(
@@ -58,6 +58,7 @@ class RuntimeEndpoint(BaseBackend):
              auth_token=self.auth_token,
              verify=self.verify,
          )
+         self._assert_success(res)
          return res.json()
  
      def get_chat_template(self):
@@ -71,7 +72,7 @@ class RuntimeEndpoint(BaseBackend):
              api_key=self.api_key,
              verify=self.verify,
          )
-         assert res.status_code == 200
+         self._assert_success(res)
  
      def commit_lazy_operations(self, s: StreamExecutor):
          data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
@@ -83,7 +84,7 @@ class RuntimeEndpoint(BaseBackend):
              api_key=self.api_key,
              verify=self.verify,
          )
-         assert res.status_code == 200
+         self._assert_success(res)
  
      def fill_image(self, s: StreamExecutor):
          data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
@@ -95,7 +96,7 @@ class RuntimeEndpoint(BaseBackend):
              api_key=self.api_key,
              verify=self.verify,
          )
-         assert res.status_code == 200
+         self._assert_success(res)
  
      def generate(
          self,
@@ -133,6 +134,8 @@ class RuntimeEndpoint(BaseBackend):
              api_key=self.api_key,
              verify=self.verify,
          )
+         self._assert_success(res)
+ 
          obj = res.json()
          comp = obj["text"]
          return comp, obj["meta_info"]
@@ -167,7 +170,7 @@ class RuntimeEndpoint(BaseBackend):
          data["stream"] = True
          self._add_images(s, data)
  
-         response = http_request(
+         res = http_request(
              self.base_url + "/generate",
              json=data,
              stream=True,
@@ -175,10 +178,11 @@ class RuntimeEndpoint(BaseBackend):
              api_key=self.api_key,
              verify=self.verify,
          )
+         self._assert_success(res)
          pos = 0
  
          incomplete_text = ""
-         for chunk in response.iter_lines(decode_unicode=False):
+         for chunk in res.iter_lines(decode_unicode=False):
              chunk = chunk.decode("utf-8")
              if chunk and chunk.startswith("data:"):
                  if chunk == "data: [DONE]":
@@ -211,7 +215,7 @@ class RuntimeEndpoint(BaseBackend):
              api_key=self.api_key,
              verify=self.verify,
          )
-         assert res.status_code == 200
+         self._assert_success(res)
          prompt_len = res.json()["meta_info"]["prompt_tokens"]
  
          # Compute logprob
@@ -229,7 +233,7 @@ class RuntimeEndpoint(BaseBackend):
              api_key=self.api_key,
              verify=self.verify,
          )
-         assert res.status_code == 200
+         self._assert_success(res)
          obj = res.json()
          normalized_prompt_logprobs = [
              r["meta_info"]["normalized_prompt_logprob"] for r in obj
@@ -253,9 +257,13 @@ class RuntimeEndpoint(BaseBackend):
              api_key=self.api_key,
              verify=self.verify,
          )
-         assert res.status_code == 200
+         self._assert_success(res)
  
      def _add_images(self, s: StreamExecutor, data):
          if s.images_:
              assert len(s.images_) == 1, "Only support one image."
              data["image_data"] = s.images_[0][1]
+ 
+     def _assert_success(self, res):
+         if res.status_code != 200:
+             raise RuntimeError(res.json())
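With the new _assert_success helper, a non-200 response now raises a RuntimeError carrying the server's JSON body instead of failing a bare assert. A caller might surface it like this (the endpoint URL is a placeholder):

    from sglang import RuntimeEndpoint

    try:
        backend = RuntimeEndpoint("http://localhost:30000")
    except RuntimeError as e:
        # e wraps the JSON error payload returned by the runtime.
        print("Runtime endpoint error:", e)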
sglang/global_config.py CHANGED
@@ -16,7 +16,7 @@ class GlobalConfig:
  
          # Optimization configs
          self.eager_fill_image = False
-         self.enable_prefix_sharing = True
+         self.enable_precache_with_tracing = True
          self.enable_parallel_encoding = True
          self.enable_parallel_decoding = True
  
@@ -25,5 +25,15 @@ class GlobalConfig:
          # adjust_cache: Adjust the position embedding of KV cache.
          self.concate_and_append_mode = "no_adjust"
  
+         # Request dependency time due to network delay
+         self.request_dependency_delay = 0.02
+         self.wait_for_new_request_delay = 0.0004
+ 
+         # New generation token ratio estimation
+         self.base_new_token_ratio = 0.4
+         self.base_min_new_token_ratio = 0.2
+         self.new_token_ratio_decay = 0.0001
+         self.new_token_ratio_recovery = 0.05
+ 
  
  global_config = GlobalConfig()
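The new fields are plain attributes on the global_config singleton, so they can be overridden before launching a runtime; the values below simply restate the defaults from this diff and are not tuning recommendations:

    from sglang.global_config import global_config

    # Scheduling delays (seconds) and new-token-ratio estimation knobs.
    global_config.request_dependency_delay = 0.02
    global_config.wait_for_new_request_delay = 0.0004
    global_config.base_new_token_ratio = 0.4
    global_config.new_token_ratio_decay = 0.0001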
sglang/lang/chat_template.py CHANGED
@@ -259,6 +259,8 @@ def match_vicuna(model_path: str):
          return get_chat_template("vicuna_v1.1")
      if "llava-v1.5" in model_path.lower():
          return get_chat_template("vicuna_v1.1")
+     if "llava-next-video-7b" in model_path.lower():
+         return get_chat_template("vicuna_v1.1")
  
  
  @register_chat_template_matching_function
@@ -283,19 +285,24 @@ def match_llama3_instruct(model_path: str):
  
  @register_chat_template_matching_function
  def match_chat_ml(model_path: str):
+     # import pdb;pdb.set_trace()
      model_path = model_path.lower()
      if "tinyllama" in model_path:
          return get_chat_template("chatml")
      if "qwen" in model_path and "chat" in model_path:
          return get_chat_template("chatml")
-     if "llava-v1.6-34b" in model_path:
+     if (
+         "llava-v1.6-34b" in model_path
+         or "llava-v1.6-yi-34b" in model_path
+         or "llava-next-video-34b" in model_path
+     ):
          return get_chat_template("chatml-llava")
  
  
  @register_chat_template_matching_function
  def match_chat_yi(model_path: str):
      model_path = model_path.lower()
-     if "yi" in model_path:
+     if "yi" in model_path and "llava" not in model_path:
          return get_chat_template("yi")
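A quick way to check what the updated matching rules resolve to, assuming the registered template objects expose a name field (the model paths below are illustrative):

    from sglang.lang.chat_template import get_chat_template_by_model_path

    # Expected: the vicuna_v1.1 template for LLaVA-NeXT-Video 7B,
    # and chatml-llava for the 34B variant.
    print(get_chat_template_by_model_path("llava-next-video-7b").name)
    print(get_chat_template_by_model_path("llava-next-video-34b").name)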