sglang 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. sglang/__init__.py +5 -1
  2. sglang/api.py +8 -3
  3. sglang/backend/anthropic.py +1 -1
  4. sglang/backend/litellm.py +90 -0
  5. sglang/backend/openai.py +148 -12
  6. sglang/backend/runtime_endpoint.py +18 -10
  7. sglang/global_config.py +11 -1
  8. sglang/lang/chat_template.py +9 -2
  9. sglang/lang/interpreter.py +161 -81
  10. sglang/lang/ir.py +29 -11
  11. sglang/lang/tracer.py +1 -1
  12. sglang/launch_server.py +1 -2
  13. sglang/launch_server_llavavid.py +31 -0
  14. sglang/srt/constrained/fsm_cache.py +3 -0
  15. sglang/srt/flush_cache.py +16 -0
  16. sglang/srt/hf_transformers_utils.py +83 -2
  17. sglang/srt/layers/extend_attention.py +17 -0
  18. sglang/srt/layers/fused_moe.py +485 -0
  19. sglang/srt/layers/logits_processor.py +12 -7
  20. sglang/srt/layers/radix_attention.py +10 -3
  21. sglang/srt/layers/token_attention.py +16 -1
  22. sglang/srt/managers/controller/dp_worker.py +110 -0
  23. sglang/srt/managers/controller/infer_batch.py +619 -0
  24. sglang/srt/managers/controller/manager_multi.py +191 -0
  25. sglang/srt/managers/controller/manager_single.py +97 -0
  26. sglang/srt/managers/controller/model_runner.py +462 -0
  27. sglang/srt/managers/controller/radix_cache.py +267 -0
  28. sglang/srt/managers/controller/schedule_heuristic.py +59 -0
  29. sglang/srt/managers/controller/tp_worker.py +791 -0
  30. sglang/srt/managers/detokenizer_manager.py +45 -45
  31. sglang/srt/managers/io_struct.py +26 -10
  32. sglang/srt/managers/router/infer_batch.py +130 -74
  33. sglang/srt/managers/router/manager.py +7 -9
  34. sglang/srt/managers/router/model_rpc.py +224 -135
  35. sglang/srt/managers/router/model_runner.py +94 -107
  36. sglang/srt/managers/router/radix_cache.py +54 -18
  37. sglang/srt/managers/router/scheduler.py +23 -34
  38. sglang/srt/managers/tokenizer_manager.py +183 -88
  39. sglang/srt/model_config.py +5 -2
  40. sglang/srt/models/commandr.py +15 -22
  41. sglang/srt/models/dbrx.py +22 -29
  42. sglang/srt/models/gemma.py +14 -24
  43. sglang/srt/models/grok.py +671 -0
  44. sglang/srt/models/llama2.py +24 -23
  45. sglang/srt/models/llava.py +85 -25
  46. sglang/srt/models/llavavid.py +298 -0
  47. sglang/srt/models/mixtral.py +254 -130
  48. sglang/srt/models/mixtral_quant.py +373 -0
  49. sglang/srt/models/qwen.py +28 -25
  50. sglang/srt/models/qwen2.py +17 -22
  51. sglang/srt/models/stablelm.py +21 -26
  52. sglang/srt/models/yivl.py +17 -25
  53. sglang/srt/openai_api_adapter.py +140 -95
  54. sglang/srt/openai_protocol.py +10 -1
  55. sglang/srt/server.py +101 -52
  56. sglang/srt/server_args.py +59 -11
  57. sglang/srt/utils.py +242 -75
  58. sglang/test/test_programs.py +44 -0
  59. sglang/test/test_utils.py +32 -1
  60. sglang/utils.py +95 -26
  61. {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/METADATA +23 -13
  62. sglang-0.1.17.dist-info/RECORD +81 -0
  63. sglang/srt/backend_config.py +0 -13
  64. sglang/srt/models/dbrx_config.py +0 -281
  65. sglang/srt/weight_utils.py +0 -402
  66. sglang-0.1.15.dist-info/RECORD +0 -69
  67. {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/LICENSE +0 -0
  68. {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/WHEEL +0 -0
  69. {sglang-0.1.15.dist-info → sglang-0.1.17.dist-info}/top_level.txt +0 -0
sglang/srt/models/yivl.py CHANGED
@@ -1,43 +1,38 @@
 """Inference-only Yi-VL model."""
 
-import os
-from typing import List, Optional
+from typing import Tuple, Iterable, Optional
 
 import torch
 import torch.nn as nn
 from transformers import CLIPVisionModel, LlavaConfig
-from sglang.srt.weight_utils import (
-    default_weight_loader,
-    hf_model_weights_iterator,
-)
+from vllm.config import CacheConfig
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.models.llava import (
     LlavaLlamaForCausalLM,
-    clip_vision_embed_forward,
     monkey_path_clip_vision_embed_forward,
 )
 
 
 class YiVLForCausalLM(LlavaLlamaForCausalLM):
-    def __init__(self, *args, **kwargs):
-        self.config = kwargs["config"]
-        super().__init__(self.config)
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: Optional[CacheConfig] = None,
+    ) -> None:
+        super().__init__(config, quant_config, cache_config)
 
         self.multi_modal_projector = YiVLMultiModalProjector(self.config)
         self.vision_tower_subfolder = self.config.mm_vision_tower.replace(
             "./", ""
         ) # Everything after "./"
 
-    def load_weights(
-        self,
-        model_name_or_path: str,
-        cache_dir: Optional[str] = None,
-        load_format: str = "auto",
-        revision: Optional[str] = None,
-    ):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         # We have to use the subfolder of the main model directory (e.g. 01-ai/Yi-VL-6B)
         self.vision_tower = CLIPVisionModel.from_pretrained(
-            model_name_or_path,
+            self.config._name_or_path,
             torch_dtype=torch.float16,
             subfolder=self.vision_tower_subfolder,
         ).cuda()
@@ -71,9 +66,8 @@ class YiVLForCausalLM(LlavaLlamaForCausalLM):
             "model.vision_tower.vision_tower": "vision_tower", # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
         }
         params_dict = dict(self.named_parameters())
-        for name, loaded_weight in hf_model_weights_iterator(
-            model_name_or_path, cache_dir, load_format, revision
-        ):
+        weights = list(weights)
+        for name, loaded_weight in weights:
             if "projector" in name or "vision_tower" in name:
                 for weight_name, param_name in projector_weights.items():
                     if weight_name in name:
@@ -83,9 +77,7 @@ class YiVLForCausalLM(LlavaLlamaForCausalLM):
                 weight_loader(param, loaded_weight)
 
         # load language model
-        self.language_model.load_weights(
-            model_name_or_path, cache_dir, load_format, revision
-        )
+        self.language_model.load_weights(weights)
 
         monkey_path_clip_vision_embed_forward()
 
@@ -106,7 +98,7 @@ class YiVLMultiModalProjector(nn.Module):
 
     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
-        hidden_state = self.ln_1(hidden_states)
+        hidden_states = self.ln_1(hidden_states)
         hidden_states = self.act(hidden_states)
         hidden_states = self.linear_2(hidden_states)
         hidden_states = self.ln_2(hidden_states)
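
Note on the yivl.py change above: load_weights moves from a path-based loader to the vLLM-style convention of consuming an iterable of (name, tensor) pairs. The following is a minimal, hypothetical sketch of that calling convention only; TinyModel and the checkpoint dict are illustrative and not sglang code.

from typing import Iterable, Tuple

import torch
import torch.nn as nn


class TinyModel(nn.Module):
    """Illustrative stand-in for a model exposing the new-style load_weights()."""

    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(4, 4)

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> None:
        # Same pattern as the diff: resolve each named parameter, then copy the
        # checkpoint tensor into it.
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in weights:
            param = params_dict[name]
            with torch.no_grad():
                param.copy_(loaded_weight)


model = TinyModel()
# Any (name, tensor) iterable works: a state_dict, a safetensors stream, etc.
checkpoint = {k: torch.zeros_like(v) for k, v in model.state_dict().items()}
model.load_weights(checkpoint.items())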
sglang/srt/openai_api_adapter.py CHANGED
@@ -1,9 +1,12 @@
 """Conversion between OpenAI APIs and native SRT APIs"""
+
+import asyncio
 import json
 import os
+from http import HTTPStatus
 
-from fastapi import HTTPException, Request
-from fastapi.responses import StreamingResponse
+from fastapi import Request
+from fastapi.responses import StreamingResponse, JSONResponse
 
 from sglang.srt.conversation import (
     Conversation,
@@ -26,14 +29,36 @@ from sglang.srt.openai_protocol import (
     CompletionResponseStreamChoice,
     CompletionStreamResponse,
     DeltaMessage,
+    ErrorResponse,
     LogProbs,
     UsageInfo,
 )
-from sglang.srt.utils import jsonify_pydantic_model
-
 
 chat_template_name = None
 
+
+def create_error_response(
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: HTTPStatus = HTTPStatus.BAD_REQUEST):
+    error = ErrorResponse(message=message,
+                          type=err_type,
+                          code=status_code.value)
+    return JSONResponse(content=error.model_dump(),
+                        status_code=error.code)
+
+
+def create_streaming_error_response(
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> str:
+    error = ErrorResponse(message=message,
+                          type=err_type,
+                          code=status_code.value)
+    json_str = json.dumps({"error": error.model_dump()})
+    return json_str
+
+
 def load_chat_template_for_openai_api(chat_template_arg):
     global chat_template_name
 
@@ -73,8 +98,8 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
     request = CompletionRequest(**request_json)
 
-    # TODO: Validate the request and return HTTPStatus.BAD_REQUEST if invalid.
-    assert request.n == 1
+    if request.n != 1:
+        return create_error_response("n != 1 is not supported")
 
     adapted_request = GenerateReqInput(
         text=request.prompt,
@@ -92,79 +117,88 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
         return_text_in_logprobs=True,
         stream=request.stream,
     )
-    adapted_request.post_init()
 
     if adapted_request.stream:
 
         async def generate_stream_resp():
             stream_buffer = ""
             n_prev_token = 0
-            async for content in tokenizer_manager.generate_request(adapted_request):
-                text = content["text"]
-                prompt_tokens = content["meta_info"]["prompt_tokens"]
-                completion_tokens = content["meta_info"]["completion_tokens"]
-
-                if not stream_buffer: # The first chunk
-                    if request.echo:
-                        # Prepend prompt in response text.
-                        text = request.prompt + text
-
-                if request.logprobs:
-                    # The first chunk and echo is enabled.
-                    if not stream_buffer and request.echo:
-                        prefill_token_logprobs = content["meta_info"][
-                            "prefill_token_logprobs"
-                        ]
-                        prefill_top_logprobs = content["meta_info"][
-                            "prefill_top_logprobs"
-                        ]
+            try:
+                async for content in tokenizer_manager.generate_request(
+                        adapted_request, raw_request):
+                    text = content["text"]
+                    prompt_tokens = content["meta_info"]["prompt_tokens"]
+                    completion_tokens = content["meta_info"]["completion_tokens"]
+
+                    if not stream_buffer: # The first chunk
+                        if request.echo:
+                            # Prepend prompt in response text.
+                            text = request.prompt + text
+
+                    if request.logprobs:
+                        # The first chunk and echo is enabled.
+                        if not stream_buffer and request.echo:
+                            prefill_token_logprobs = content["meta_info"][
+                                "prefill_token_logprobs"
+                            ]
+                            prefill_top_logprobs = content["meta_info"][
+                                "prefill_top_logprobs"
+                            ]
+                        else:
+                            prefill_token_logprobs = None
+                            prefill_top_logprobs = None
+
+                        logprobs = to_openai_style_logprobs(
+                            prefill_token_logprobs=prefill_token_logprobs,
+                            prefill_top_logprobs=prefill_top_logprobs,
+                            decode_token_logprobs=content["meta_info"][
+                                "decode_token_logprobs"
+                            ][n_prev_token:],
+                            decode_top_logprobs=content["meta_info"]["decode_top_logprobs"][
+                                n_prev_token:
+                            ],
+                        )
+
+                        n_prev_token = len(content["meta_info"]["decode_token_logprobs"])
                     else:
-                        prefill_token_logprobs = None
-                        prefill_top_logprobs = None
-
-                    logprobs = to_openai_style_logprobs(
-                        prefill_token_logprobs=prefill_token_logprobs,
-                        prefill_top_logprobs=prefill_top_logprobs,
-                        decode_token_logprobs=content["meta_info"][
-                            "decode_token_logprobs"
-                        ][n_prev_token:],
-                        decode_top_logprobs=content["meta_info"]["decode_top_logprobs"][
-                            n_prev_token:
-                        ],
-                    )
+                        logprobs = None
 
-                    n_prev_token = len(content["meta_info"]["decode_token_logprobs"])
-                else:
-                    logprobs = None
-
-                delta = text[len(stream_buffer) :]
-                stream_buffer = content["text"]
-                choice_data = CompletionResponseStreamChoice(
-                    index=0,
-                    text=delta,
-                    logprobs=logprobs,
-                    finish_reason=None,
-                )
-                chunk = CompletionStreamResponse(
-                    id=content["meta_info"]["id"],
-                    object="text_completion",
-                    choices=[choice_data],
-                    model=request.model,
-                    usage=UsageInfo(
-                        prompt_tokens=prompt_tokens,
-                        completion_tokens=completion_tokens,
-                        total_tokens=prompt_tokens + completion_tokens,
-                    ),
-                )
-                yield f"data: {jsonify_pydantic_model(chunk)}\n\n"
+                    delta = text[len(stream_buffer) :]
+                    stream_buffer = content["text"]
+                    choice_data = CompletionResponseStreamChoice(
+                        index=0,
+                        text=delta,
+                        logprobs=logprobs,
+                        finish_reason=content["meta_info"]["finish_reason"],
+                    )
+                    chunk = CompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        object="text_completion",
+                        choices=[choice_data],
+                        model=request.model,
+                        usage=UsageInfo(
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=completion_tokens,
+                            total_tokens=prompt_tokens + completion_tokens,
+                        ),
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+            except ValueError as e:
+                error = create_streaming_error_response(str(e))
+                yield f"data: {error}\n\n"
             yield "data: [DONE]\n\n"
 
-        return StreamingResponse(generate_stream_resp(), media_type="text/event-stream")
+        return StreamingResponse(generate_stream_resp(), media_type="text/event-stream",
+                                 background=tokenizer_manager.create_abort_task(adapted_request))
 
     # Non-streaming response.
-    ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
-    ret = ret[0] if isinstance(ret, list) else ret
+    try:
+        ret = await tokenizer_manager.generate_request(
+            adapted_request, raw_request).__anext__()
+    except ValueError as e:
+        return create_error_response(str(e))
 
+    ret = ret[0] if isinstance(ret, list) else ret
     prompt_tokens = ret["meta_info"]["prompt_tokens"]
    completion_tokens = ret["meta_info"]["completion_tokens"]
     text = ret["text"]
@@ -192,7 +226,7 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
         index=0,
         text=text,
         logprobs=logprobs,
-        finish_reason=None, # TODO(comaniac): Add finish reason.
+        finish_reason=ret["meta_info"]["finish_reason"],
     )
     response = CompletionResponse(
         id=ret["meta_info"]["id"],
@@ -211,8 +245,8 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
     request = ChatCompletionRequest(**request_json)
 
-    # TODO: Validate the request and return HTTPStatus.BAD_REQUEST if invalid.
-    assert request.n == 1
+    if request.n != 1:
+        return create_error_response("n != 1 is not supported")
 
     # Prep the data needed for the underlying GenerateReqInput:
     # - prompt: The full prompt string.
@@ -257,7 +291,6 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
         },
         stream=request.stream,
     )
-    adapted_request.post_init()
 
     if adapted_request.stream:
 
@@ -265,46 +298,58 @@ async def v1_chat_completions(tokenizer_manager, raw_request: Request):
             is_first = True
 
             stream_buffer = ""
-            async for content in tokenizer_manager.generate_request(adapted_request):
-                if is_first:
-                    # First chunk with role
-                    is_first = False
+            try:
+                async for content in tokenizer_manager.generate_request(adapted_request, raw_request):
+                    if is_first:
+                        # First chunk with role
+                        is_first = False
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=0,
+                            delta=DeltaMessage(role="assistant"),
+                            finish_reason=content["meta_info"]["finish_reason"],
+                        )
+                        chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            choices=[choice_data],
+                            model=request.model,
+                        )
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+
+                    text = content["text"]
+                    delta = text[len(stream_buffer) :]
+                    stream_buffer = text
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=0,
-                        delta=DeltaMessage(role="assistant"),
-                        finish_reason=None,
+                        delta=DeltaMessage(content=delta),
+                        finish_reason=content["meta_info"]["finish_reason"],
                     )
                     chunk = ChatCompletionStreamResponse(
                         id=content["meta_info"]["id"],
                         choices=[choice_data],
                         model=request.model,
                     )
-                    yield f"data: {jsonify_pydantic_model(chunk)}\n\n"
-
-                text = content["text"]
-                delta = text[len(stream_buffer) :]
-                stream_buffer = text
-                choice_data = ChatCompletionResponseStreamChoice(
-                    index=0, delta=DeltaMessage(content=delta), finish_reason=None
-                )
-                chunk = ChatCompletionStreamResponse(
-                    id=content["meta_info"]["id"],
-                    choices=[choice_data],
-                    model=request.model,
-                )
-                yield f"data: {jsonify_pydantic_model(chunk)}\n\n"
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+            except ValueError as e:
+                error = create_streaming_error_response(str(e))
+                yield f"data: {error}\n\n"
             yield "data: [DONE]\n\n"
 
-        return StreamingResponse(generate_stream_resp(), media_type="text/event-stream")
+        return StreamingResponse(generate_stream_resp(), media_type="text/event-stream",
+                                 background=tokenizer_manager.create_abort_task(adapted_request))
 
     # Non-streaming response.
-    ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
+    try:
+        ret = await tokenizer_manager.generate_request(
+            adapted_request, raw_request).__anext__()
+    except ValueError as e:
+        return create_error_response(str(e))
+
     prompt_tokens = ret["meta_info"]["prompt_tokens"]
     completion_tokens = ret["meta_info"]["completion_tokens"]
     choice_data = ChatCompletionResponseChoice(
         index=0,
         message=ChatMessage(role="assistant", content=ret["text"]),
-        finish_reason=None, # TODO(comaniac): Add finish reason.
+        finish_reason=ret["meta_info"]["finish_reason"],
     )
     response = ChatCompletionResponse(
         id=ret["meta_info"]["id"],
@@ -332,7 +377,7 @@ def to_openai_style_logprobs(
             ret_logprobs.tokens.append(token_text)
             ret_logprobs.token_logprobs.append(logprob)
 
-            # Not Supported yet
+            # Not supported yet
             ret_logprobs.text_offset.append(-1)
 
     def append_top_logprobs(top_logprobs):
@@ -353,4 +398,4 @@ def to_openai_style_logprobs(
     if decode_top_logprobs is not None:
         append_top_logprobs(decode_top_logprobs)
 
-    return ret_logprobs
+    return ret_logprobs
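
Note on the openai_api_adapter.py changes above: bare asserts are replaced with OpenAI-style error payloads, mid-stream ValueErrors become a single SSE error line, and streaming responses carry an abort task that runs when the stream ends or the client disconnects. The following is a hedged, self-contained sketch of that pattern; the endpoint, abort_request helper, and request id are illustrative and not sglang's actual handlers.

import json

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from starlette.background import BackgroundTask

app = FastAPI()


def abort_request(request_id: str) -> None:
    # Illustrative stand-in for tokenizer_manager.create_abort_task(...): run
    # cleanup after the response finishes or the client goes away.
    print(f"aborting {request_id}")


@app.get("/v1/demo_stream")
async def demo_stream():
    async def gen():
        try:
            for i in range(3):
                yield f"data: {json.dumps({'text': f'chunk {i}'})}\n\n"
            raise ValueError("demo failure")
        except ValueError as e:
            # Mirrors create_streaming_error_response(): the error is one SSE line.
            payload = {"error": {"message": str(e), "type": "BadRequestError", "code": 400}}
            yield f"data: {json.dumps(payload)}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(gen(), media_type="text/event-stream",
                             background=BackgroundTask(abort_request, "req-0"))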
sglang/srt/openai_protocol.py CHANGED
@@ -1,4 +1,5 @@
 """pydantic models for OpenAI API protocol"""
+
 import time
 from typing import Dict, List, Optional, Union
 
@@ -6,6 +7,14 @@ from pydantic import BaseModel, Field
 from typing_extensions import Literal
 
 
+class ErrorResponse(BaseModel):
+    object: str = "error"
+    message: str
+    type: str
+    param: Optional[str] = None
+    code: int
+
+
 class LogProbs(BaseModel):
     text_offset: List[int] = Field(default_factory=list)
     token_logprobs: List[Optional[float]] = Field(default_factory=list)
@@ -178,4 +187,4 @@ class ChatCompletionStreamResponse(BaseModel):
     object: str = "chat.completion.chunk"
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
-    choices: List[ChatCompletionResponseStreamChoice]
+    choices: List[ChatCompletionResponseStreamChoice]
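
Note on the openai_protocol.py change above: the new ErrorResponse model is what both error helpers in openai_api_adapter.py serialize. A minimal sketch of the two serialized forms follows; the field values are illustrative, and the model is restated locally so the snippet runs without sglang installed.

import json
from typing import Optional

from pydantic import BaseModel


class ErrorResponse(BaseModel):
    # Field names mirror the class added in this diff.
    object: str = "error"
    message: str
    type: str
    param: Optional[str] = None
    code: int


error = ErrorResponse(message="n != 1 is not supported",
                      type="BadRequestError", code=400)

# Non-streaming: the dict becomes the JSON body of JSONResponse(..., status_code=error.code).
print(error.model_dump())

# Streaming: the wrapped form is emitted as a single SSE "data: ..." line.
print(json.dumps({"error": error.model_dump()}))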