sglang 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. sglang/__init__.py +57 -2
  2. sglang/api.py +8 -5
  3. sglang/backend/anthropic.py +18 -4
  4. sglang/backend/openai.py +2 -1
  5. sglang/backend/runtime_endpoint.py +18 -5
  6. sglang/backend/vertexai.py +1 -0
  7. sglang/global_config.py +5 -1
  8. sglang/lang/chat_template.py +83 -2
  9. sglang/lang/interpreter.py +92 -35
  10. sglang/lang/ir.py +12 -9
  11. sglang/lang/tracer.py +6 -4
  12. sglang/launch_server_llavavid.py +31 -0
  13. sglang/srt/constrained/fsm_cache.py +1 -0
  14. sglang/srt/constrained/jump_forward.py +1 -0
  15. sglang/srt/conversation.py +2 -2
  16. sglang/srt/flush_cache.py +16 -0
  17. sglang/srt/hf_transformers_utils.py +10 -2
  18. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  19. sglang/srt/layers/extend_attention.py +1 -0
  20. sglang/srt/layers/logits_processor.py +114 -54
  21. sglang/srt/layers/radix_attention.py +2 -1
  22. sglang/srt/layers/token_attention.py +1 -0
  23. sglang/srt/managers/detokenizer_manager.py +5 -1
  24. sglang/srt/managers/io_struct.py +27 -3
  25. sglang/srt/managers/router/infer_batch.py +97 -48
  26. sglang/srt/managers/router/manager.py +11 -8
  27. sglang/srt/managers/router/model_rpc.py +169 -90
  28. sglang/srt/managers/router/model_runner.py +110 -166
  29. sglang/srt/managers/router/radix_cache.py +89 -51
  30. sglang/srt/managers/router/scheduler.py +17 -28
  31. sglang/srt/managers/tokenizer_manager.py +110 -33
  32. sglang/srt/memory_pool.py +5 -14
  33. sglang/srt/model_config.py +11 -0
  34. sglang/srt/models/commandr.py +372 -0
  35. sglang/srt/models/dbrx.py +412 -0
  36. sglang/srt/models/dbrx_config.py +281 -0
  37. sglang/srt/models/gemma.py +24 -25
  38. sglang/srt/models/llama2.py +25 -26
  39. sglang/srt/models/llava.py +8 -10
  40. sglang/srt/models/llavavid.py +307 -0
  41. sglang/srt/models/mixtral.py +29 -33
  42. sglang/srt/models/qwen.py +34 -25
  43. sglang/srt/models/qwen2.py +25 -26
  44. sglang/srt/models/stablelm.py +26 -26
  45. sglang/srt/models/yivl.py +3 -5
  46. sglang/srt/openai_api_adapter.py +356 -0
  47. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +36 -20
  48. sglang/srt/sampling_params.py +2 -0
  49. sglang/srt/server.py +91 -456
  50. sglang/srt/server_args.py +79 -49
  51. sglang/srt/utils.py +212 -47
  52. sglang/srt/weight_utils.py +417 -0
  53. sglang/test/test_programs.py +8 -7
  54. sglang/test/test_utils.py +195 -7
  55. sglang/utils.py +77 -26
  56. {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/METADATA +20 -18
  57. sglang-0.1.16.dist-info/RECORD +72 -0
  58. sglang-0.1.14.dist-info/RECORD +0 -64
  59. {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/LICENSE +0 -0
  60. {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/WHEEL +0 -0
  61. {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api_adapter.py (new file)
@@ -0,0 +1,356 @@
+"""Conversion between OpenAI APIs and native SRT APIs"""
+import json
+import os
+
+from fastapi import HTTPException, Request
+from fastapi.responses import StreamingResponse
+
+from sglang.srt.conversation import (
+    Conversation,
+    SeparatorStyle,
+    chat_template_exists,
+    generate_chat_conv,
+    register_conv_template,
+)
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.openai_protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatCompletionResponseChoice,
+    ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse,
+    ChatMessage,
+    CompletionRequest,
+    CompletionResponse,
+    CompletionResponseChoice,
+    CompletionResponseStreamChoice,
+    CompletionStreamResponse,
+    DeltaMessage,
+    LogProbs,
+    UsageInfo,
+)
+from sglang.srt.utils import jsonify_pydantic_model
+
+
+chat_template_name = None
+
+def load_chat_template_for_openai_api(chat_template_arg):
+    global chat_template_name
+
+    print(f"Use chat template: {chat_template_arg}")
+    if not chat_template_exists(chat_template_arg):
+        if not os.path.exists(chat_template_arg):
+            raise RuntimeError(
+                f"Chat template {chat_template_arg} is not a built-in template name "
+                "or a valid chat template file path."
+            )
+        with open(chat_template_arg, "r") as filep:
+            template = json.load(filep)
+            try:
+                sep_style = SeparatorStyle[template["sep_style"]]
+            except KeyError:
+                raise ValueError(
+                    f"Unknown separator style: {template['sep_style']}"
+                ) from None
+            register_conv_template(
+                Conversation(
+                    name=template["name"],
+                    system_template=template["system"] + "\n{system_message}",
+                    system_message=template.get("system_message", ""),
+                    roles=(template["user"], template["assistant"]),
+                    sep_style=sep_style,
+                    sep=template.get("sep", "\n"),
+                    stop_str=template["stop_str"],
+                ),
+                override=True,
+            )
+            chat_template_name = template["name"]
+    else:
+        chat_template_name = chat_template_arg
+
+
+async def v1_completions(tokenizer_manager, raw_request: Request):
+    request_json = await raw_request.json()
+    request = CompletionRequest(**request_json)
+
+    # TODO: Validate the request and return HTTPStatus.BAD_REQUEST if invalid.
+    assert request.n == 1
+
+    adapted_request = GenerateReqInput(
+        text=request.prompt,
+        sampling_params={
+            "temperature": request.temperature,
+            "max_new_tokens": request.max_tokens,
+            "stop": request.stop,
+            "top_p": request.top_p,
+            "presence_penalty": request.presence_penalty,
+            "frequency_penalty": request.frequency_penalty,
+            "regex": request.regex,
+        },
+        return_logprob=request.logprobs is not None and request.logprobs > 0,
+        top_logprobs_num=request.logprobs if request.logprobs is not None else 0,
+        return_text_in_logprobs=True,
+        stream=request.stream,
+    )
+    adapted_request.post_init()
+
+    if adapted_request.stream:
+
+        async def generate_stream_resp():
+            stream_buffer = ""
+            n_prev_token = 0
+            async for content in tokenizer_manager.generate_request(adapted_request):
+                text = content["text"]
+                prompt_tokens = content["meta_info"]["prompt_tokens"]
+                completion_tokens = content["meta_info"]["completion_tokens"]
+
+                if not stream_buffer:  # The first chunk
+                    if request.echo:
+                        # Prepend prompt in response text.
+                        text = request.prompt + text
+
+                if request.logprobs:
+                    # The first chunk and echo is enabled.
+                    if not stream_buffer and request.echo:
+                        prefill_token_logprobs = content["meta_info"][
+                            "prefill_token_logprobs"
+                        ]
+                        prefill_top_logprobs = content["meta_info"][
+                            "prefill_top_logprobs"
+                        ]
+                    else:
+                        prefill_token_logprobs = None
+                        prefill_top_logprobs = None
+
+                    logprobs = to_openai_style_logprobs(
+                        prefill_token_logprobs=prefill_token_logprobs,
+                        prefill_top_logprobs=prefill_top_logprobs,
+                        decode_token_logprobs=content["meta_info"][
+                            "decode_token_logprobs"
+                        ][n_prev_token:],
+                        decode_top_logprobs=content["meta_info"]["decode_top_logprobs"][
+                            n_prev_token:
+                        ],
+                    )
+
+                    n_prev_token = len(content["meta_info"]["decode_token_logprobs"])
+                else:
+                    logprobs = None
+
+                delta = text[len(stream_buffer) :]
+                stream_buffer = content["text"]
+                choice_data = CompletionResponseStreamChoice(
+                    index=0,
+                    text=delta,
+                    logprobs=logprobs,
+                    finish_reason=None,
+                )
+                chunk = CompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    object="text_completion",
+                    choices=[choice_data],
+                    model=request.model,
+                    usage=UsageInfo(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=prompt_tokens + completion_tokens,
+                    ),
+                )
+                yield f"data: {jsonify_pydantic_model(chunk)}\n\n"
+            yield "data: [DONE]\n\n"
+
+        return StreamingResponse(generate_stream_resp(), media_type="text/event-stream")
+
+    # Non-streaming response.
+    ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
+    ret = ret[0] if isinstance(ret, list) else ret
+
+    prompt_tokens = ret["meta_info"]["prompt_tokens"]
+    completion_tokens = ret["meta_info"]["completion_tokens"]
+    text = ret["text"]
+    if request.echo:
+        text = request.prompt + text
+
+    if request.logprobs:
+        if request.echo:
+            prefill_token_logprobs = ret["meta_info"]["prefill_token_logprobs"]
+            prefill_top_logprobs = ret["meta_info"]["prefill_top_logprobs"]
+        else:
+            prefill_token_logprobs = None
+            prefill_top_logprobs = None
+
+        logprobs = to_openai_style_logprobs(
+            prefill_token_logprobs=prefill_token_logprobs,
+            prefill_top_logprobs=prefill_top_logprobs,
+            decode_token_logprobs=ret["meta_info"]["decode_token_logprobs"],
+            decode_top_logprobs=ret["meta_info"]["decode_top_logprobs"],
+        )
+    else:
+        logprobs = None
+
+    choice_data = CompletionResponseChoice(
+        index=0,
+        text=text,
+        logprobs=logprobs,
+        finish_reason=None,  # TODO(comaniac): Add finish reason.
+    )
+    response = CompletionResponse(
+        id=ret["meta_info"]["id"],
+        model=request.model,
+        choices=[choice_data],
+        usage=UsageInfo(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
+        ),
+    )
+    return response
+
+
+async def v1_chat_completions(tokenizer_manager, raw_request: Request):
+    request_json = await raw_request.json()
+    request = ChatCompletionRequest(**request_json)
+
+    # TODO: Validate the request and return HTTPStatus.BAD_REQUEST if invalid.
+    assert request.n == 1
+
+    # Prep the data needed for the underlying GenerateReqInput:
+    # - prompt: The full prompt string.
+    # - stop: Custom stop tokens.
+    # - image_data: None or a list of image strings (URLs or base64 strings).
+    #   None skips any image processing in GenerateReqInput.
+    if not isinstance(request.messages, str):
+        # Apply chat template and its stop strings.
+        if chat_template_name is None:
+            prompt = tokenizer_manager.tokenizer.apply_chat_template(
+                request.messages, tokenize=False, add_generation_prompt=True
+            )
+            stop = request.stop
+            image_data = None
+        else:
+            conv = generate_chat_conv(request, chat_template_name)
+            prompt = conv.get_prompt()
+            image_data = conv.image_data
+            stop = conv.stop_str or []
+            if request.stop:
+                if isinstance(request.stop, str):
+                    stop.append(request.stop)
+                else:
+                    stop.extend(request.stop)
+    else:
+        # Use the raw prompt and stop strings if the messages is already a string.
+        prompt = request.messages
+        stop = request.stop
+        image_data = None
+
+    adapted_request = GenerateReqInput(
+        text=prompt,
+        image_data=image_data,
+        sampling_params={
+            "temperature": request.temperature,
+            "max_new_tokens": request.max_tokens,
+            "stop": stop,
+            "top_p": request.top_p,
+            "presence_penalty": request.presence_penalty,
+            "frequency_penalty": request.frequency_penalty,
+            "regex": request.regex,
+        },
+        stream=request.stream,
+    )
+    adapted_request.post_init()
+
+    if adapted_request.stream:
+
+        async def generate_stream_resp():
+            is_first = True
+
+            stream_buffer = ""
+            async for content in tokenizer_manager.generate_request(adapted_request):
+                if is_first:
+                    # First chunk with role
+                    is_first = False
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=0,
+                        delta=DeltaMessage(role="assistant"),
+                        finish_reason=None,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {jsonify_pydantic_model(chunk)}\n\n"
+
+                text = content["text"]
+                delta = text[len(stream_buffer) :]
+                stream_buffer = text
+                choice_data = ChatCompletionResponseStreamChoice(
+                    index=0, delta=DeltaMessage(content=delta), finish_reason=None
+                )
+                chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    choices=[choice_data],
+                    model=request.model,
+                )
+                yield f"data: {jsonify_pydantic_model(chunk)}\n\n"
+            yield "data: [DONE]\n\n"
+
+        return StreamingResponse(generate_stream_resp(), media_type="text/event-stream")
+
+    # Non-streaming response.
+    ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
+    prompt_tokens = ret["meta_info"]["prompt_tokens"]
+    completion_tokens = ret["meta_info"]["completion_tokens"]
+    choice_data = ChatCompletionResponseChoice(
+        index=0,
+        message=ChatMessage(role="assistant", content=ret["text"]),
+        finish_reason=None,  # TODO(comaniac): Add finish reason.
+    )
+    response = ChatCompletionResponse(
+        id=ret["meta_info"]["id"],
+        model=request.model,
+        choices=[choice_data],
+        usage=UsageInfo(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
+        ),
+    )
+    return response
+
+
+def to_openai_style_logprobs(
+    prefill_token_logprobs=None,
+    decode_token_logprobs=None,
+    prefill_top_logprobs=None,
+    decode_top_logprobs=None,
+):
+    ret_logprobs = LogProbs()
+
+    def append_token_logprobs(token_logprobs):
+        for logprob, _, token_text in token_logprobs:
+            ret_logprobs.tokens.append(token_text)
+            ret_logprobs.token_logprobs.append(logprob)
+
+            # Not Supported yet
+            ret_logprobs.text_offset.append(-1)
+
+    def append_top_logprobs(top_logprobs):
+        for tokens in top_logprobs:
+            if tokens is not None:
+                ret_logprobs.top_logprobs.append(
+                    {token[2]: token[0] for token in tokens}
+                )
+            else:
+                ret_logprobs.top_logprobs.append(None)
+
+    if prefill_token_logprobs is not None:
+        append_token_logprobs(prefill_token_logprobs)
+    if decode_token_logprobs is not None:
+        append_token_logprobs(decode_token_logprobs)
+    if prefill_top_logprobs is not None:
+        append_top_logprobs(prefill_top_logprobs)
+    if decode_top_logprobs is not None:
+        append_top_logprobs(decode_top_logprobs)
+
+    return ret_logprobs
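
This new adapter is what gives the 0.1.16 server its OpenAI-compatible HTTP surface. As a rough illustration only (none of this is part of the diff), the sketch below assumes a locally launched sglang server; the base URL, port, and model name are placeholders, and /v1/completions and /v1/chat/completions are the routes that v1_completions and v1_chat_completions are presumably mounted on in sglang/srt/server.py, whose hunks are not shown here.

import json
import requests

BASE = "http://127.0.0.1:30000"  # placeholder host/port for a local server

# /v1/completions maps onto v1_completions() above; a non-zero "logprobs"
# turns on return_logprob / top_logprobs_num in the adapted GenerateReqInput.
resp = requests.post(
    f"{BASE}/v1/completions",
    json={
        "model": "default",
        "prompt": "The capital of France is",
        "max_tokens": 16,
        "temperature": 0,
        "logprobs": 3,
    },
)
print(resp.json()["choices"][0]["text"])

# /v1/chat/completions maps onto v1_chat_completions(); with stream=True the
# adapter emits server-sent events and closes with "data: [DONE]".
with requests.post(
    f"{BASE}/v1/chat/completions",
    json={
        "model": "default",
        "messages": [{"role": "user", "content": "Say hi in one word."}],
        "stream": True,
    },
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if not line or line == b"data: [DONE]":
            continue
        payload = json.loads(line[len(b"data: "):])
        print(payload["choices"][0]["delta"].get("content") or "", end="")

The client filters for the "[DONE]" sentinel because the streaming branches above terminate every SSE stream with exactly that line.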
sglang/srt/openai_protocol.py (moved from sglang/srt/managers/openai_protocol.py)
@@ -1,3 +1,4 @@
+"""pydantic models for OpenAI API protocol"""
 import time
 from typing import Dict, List, Optional, Union
 
@@ -19,21 +20,24 @@ class UsageInfo(BaseModel):
 
 
 class CompletionRequest(BaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/completions/create
     model: str
-    prompt: Union[str, List[str]]
-    suffix: Optional[str] = None
-    max_tokens: Optional[int] = 16
-    temperature: Optional[float] = 0.7
-    top_p: Optional[float] = 1.0
-    n: Optional[int] = 1
-    stream: Optional[bool] = False
-    logprobs: Optional[int] = None
+    prompt: Union[List[int], List[List[int]], str, List[str]]
+    best_of: Optional[int] = None
     echo: Optional[bool] = False
-    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
-    presence_penalty: Optional[float] = 0.0
     frequency_penalty: Optional[float] = 0.0
-    best_of: Optional[int] = None
     logit_bias: Optional[Dict[str, float]] = None
+    logprobs: Optional[int] = None
+    max_tokens: Optional[int] = 16
+    n: int = 1
+    presence_penalty: Optional[float] = 0.0
+    seed: Optional[int] = None
+    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
+    stream: Optional[bool] = False
+    suffix: Optional[str] = None
+    temperature: Optional[float] = 1.0
+    top_p: Optional[float] = 1.0
     user: Optional[str] = None
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
@@ -107,20 +111,30 @@ ChatCompletionMessageParam = Union[
 ]
 
 
+class ResponseFormat(BaseModel):
+    # type must be "json_object" or "text"
+    type: Literal["text", "json_object"]
+
+
 class ChatCompletionRequest(BaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/chat/create
+    messages: List[ChatCompletionMessageParam]
     model: str
-    messages: Union[str, List[ChatCompletionMessageParam]]
-    temperature: Optional[float] = 0.7
-    top_p: Optional[float] = 1.0
+    frequency_penalty: Optional[float] = 0.0
+    logit_bias: Optional[Dict[str, float]] = None
+    logprobs: Optional[bool] = False
+    top_logprobs: Optional[int] = None
+    max_tokens: Optional[int] = None
     n: Optional[int] = 1
-    max_tokens: Optional[int] = 16
+    presence_penalty: Optional[float] = 0.0
+    response_format: Optional[ResponseFormat] = None
+    seed: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
     stream: Optional[bool] = False
-    presence_penalty: Optional[float] = 0.0
-    frequency_penalty: Optional[float] = 0.0
-    logit_bias: Optional[Dict[str, float]] = None
+    temperature: Optional[float] = 0.7
+    top_p: Optional[float] = 1.0
     user: Optional[str] = None
-    best_of: Optional[int] = None
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
@@ -134,6 +148,7 @@ class ChatMessage(BaseModel):
 class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
+    logprobs: Optional[LogProbs] = None
     finish_reason: Optional[str] = None
 
 
@@ -154,6 +169,7 @@ class DeltaMessage(BaseModel):
 class ChatCompletionResponseStreamChoice(BaseModel):
     index: int
     delta: DeltaMessage
+    logprobs: Optional[LogProbs] = None
     finish_reason: Optional[str] = None
 
 
@@ -162,4 +178,4 @@ class ChatCompletionStreamResponse(BaseModel):
     object: str = "chat.completion.chunk"
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
-    choices: List[ChatCompletionResponseStreamChoice]
+    choices: List[ChatCompletionResponseStreamChoice]
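
To make the reworked request models concrete, here is a small, hypothetical usage sketch (not part of the diff). It only exercises fields and defaults visible in the hunks above, and it assumes pydantic coerces plain dicts into the message and response-format models the way FastAPI request parsing normally does.

from sglang.srt.openai_protocol import ChatCompletionRequest, CompletionRequest

# prompt now also accepts pre-tokenized input (List[int] / List[List[int]]).
req = CompletionRequest(model="default", prompt=[1, 2, 3], logprobs=2)
assert req.n == 1 and req.temperature == 1.0  # new defaults shown in the diff

# messages is now a required, typed list (the bare-string form was dropped),
# and response_format / seed / top_logprobs are newly accepted fields.
chat_req = ChatCompletionRequest(
    model="default",
    messages=[{"role": "user", "content": "hello"}],
    response_format={"type": "json_object"},
    seed=42,
)
print(chat_req.temperature, chat_req.top_p)  # 0.7 1.0

Beyond the alphabetical reordering, the visible behavioral changes are the new completion defaults (temperature 1.0, n typed as a plain int), the added seed / best_of / response_format / top_logprobs fields, and the optional logprobs attached to both chat choice models.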
sglang/srt/sampling_params.py
@@ -17,6 +17,7 @@ class SamplingParams:
         presence_penalty: float = 0.0,
         ignore_eos: bool = False,
         skip_special_tokens: bool = True,
+        spaces_between_special_tokens: bool = True,
         dtype: Optional[str] = None,
         regex: Optional[str] = None,
     ) -> None:
@@ -29,6 +30,7 @@ class SamplingParams:
         self.max_new_tokens = max_new_tokens
         self.ignore_eos = ignore_eos
         self.skip_special_tokens = skip_special_tokens
+        self.spaces_between_special_tokens = spaces_between_special_tokens
         self.dtype = dtype
         self.regex = regex
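
For completeness, a hypothetical sketch (not part of the diff) of how the new spaces_between_special_tokens flag could be exercised through the server's native /generate route, whose sampling_params dict is mapped onto SamplingParams. The URL, port, and prompt are placeholders, and the flag appears to mirror the detokenization option of the same name in Hugging Face tokenizers' decode().

import requests

resp = requests.post(
    "http://127.0.0.1:30000/generate",  # placeholder local server address
    json={
        "text": "<|user|>Hello<|assistant|>",
        "sampling_params": {
            "max_new_tokens": 32,
            # keep special tokens in the output, without inserted spaces
            "skip_special_tokens": False,
            "spaces_between_special_tokens": False,
        },
    },
)
print(resp.json()["text"])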