sglang 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +57 -2
- sglang/api.py +8 -5
- sglang/backend/anthropic.py +18 -4
- sglang/backend/openai.py +2 -1
- sglang/backend/runtime_endpoint.py +18 -5
- sglang/backend/vertexai.py +1 -0
- sglang/global_config.py +5 -1
- sglang/lang/chat_template.py +83 -2
- sglang/lang/interpreter.py +92 -35
- sglang/lang/ir.py +12 -9
- sglang/lang/tracer.py +6 -4
- sglang/launch_server_llavavid.py +31 -0
- sglang/srt/constrained/fsm_cache.py +1 -0
- sglang/srt/constrained/jump_forward.py +1 -0
- sglang/srt/conversation.py +2 -2
- sglang/srt/flush_cache.py +16 -0
- sglang/srt/hf_transformers_utils.py +10 -2
- sglang/srt/layers/context_flashattention_nopad.py +1 -0
- sglang/srt/layers/extend_attention.py +1 -0
- sglang/srt/layers/logits_processor.py +114 -54
- sglang/srt/layers/radix_attention.py +2 -1
- sglang/srt/layers/token_attention.py +1 -0
- sglang/srt/managers/detokenizer_manager.py +5 -1
- sglang/srt/managers/io_struct.py +27 -3
- sglang/srt/managers/router/infer_batch.py +97 -48
- sglang/srt/managers/router/manager.py +11 -8
- sglang/srt/managers/router/model_rpc.py +169 -90
- sglang/srt/managers/router/model_runner.py +110 -166
- sglang/srt/managers/router/radix_cache.py +89 -51
- sglang/srt/managers/router/scheduler.py +17 -28
- sglang/srt/managers/tokenizer_manager.py +110 -33
- sglang/srt/memory_pool.py +5 -14
- sglang/srt/model_config.py +11 -0
- sglang/srt/models/commandr.py +372 -0
- sglang/srt/models/dbrx.py +412 -0
- sglang/srt/models/dbrx_config.py +281 -0
- sglang/srt/models/gemma.py +24 -25
- sglang/srt/models/llama2.py +25 -26
- sglang/srt/models/llava.py +8 -10
- sglang/srt/models/llavavid.py +307 -0
- sglang/srt/models/mixtral.py +29 -33
- sglang/srt/models/qwen.py +34 -25
- sglang/srt/models/qwen2.py +25 -26
- sglang/srt/models/stablelm.py +26 -26
- sglang/srt/models/yivl.py +3 -5
- sglang/srt/openai_api_adapter.py +356 -0
- sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +36 -20
- sglang/srt/sampling_params.py +2 -0
- sglang/srt/server.py +91 -456
- sglang/srt/server_args.py +79 -49
- sglang/srt/utils.py +212 -47
- sglang/srt/weight_utils.py +417 -0
- sglang/test/test_programs.py +8 -7
- sglang/test/test_utils.py +195 -7
- sglang/utils.py +77 -26
- {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/METADATA +20 -18
- sglang-0.1.16.dist-info/RECORD +72 -0
- sglang-0.1.14.dist-info/RECORD +0 -64
- {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/LICENSE +0 -0
- {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/WHEEL +0 -0
- {sglang-0.1.14.dist-info → sglang-0.1.16.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api_adapter.py
ADDED
@@ -0,0 +1,356 @@
+"""Conversion between OpenAI APIs and native SRT APIs"""
+import json
+import os
+
+from fastapi import HTTPException, Request
+from fastapi.responses import StreamingResponse
+
+from sglang.srt.conversation import (
+    Conversation,
+    SeparatorStyle,
+    chat_template_exists,
+    generate_chat_conv,
+    register_conv_template,
+)
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.openai_protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatCompletionResponseChoice,
+    ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse,
+    ChatMessage,
+    CompletionRequest,
+    CompletionResponse,
+    CompletionResponseChoice,
+    CompletionResponseStreamChoice,
+    CompletionStreamResponse,
+    DeltaMessage,
+    LogProbs,
+    UsageInfo,
+)
+from sglang.srt.utils import jsonify_pydantic_model
+
+
+chat_template_name = None
+
+def load_chat_template_for_openai_api(chat_template_arg):
+    global chat_template_name
+
+    print(f"Use chat template: {chat_template_arg}")
+    if not chat_template_exists(chat_template_arg):
+        if not os.path.exists(chat_template_arg):
+            raise RuntimeError(
+                f"Chat template {chat_template_arg} is not a built-in template name "
+                "or a valid chat template file path."
+            )
+        with open(chat_template_arg, "r") as filep:
+            template = json.load(filep)
+            try:
+                sep_style = SeparatorStyle[template["sep_style"]]
+            except KeyError:
+                raise ValueError(
+                    f"Unknown separator style: {template['sep_style']}"
+                ) from None
+            register_conv_template(
+                Conversation(
+                    name=template["name"],
+                    system_template=template["system"] + "\n{system_message}",
+                    system_message=template.get("system_message", ""),
+                    roles=(template["user"], template["assistant"]),
+                    sep_style=sep_style,
+                    sep=template.get("sep", "\n"),
+                    stop_str=template["stop_str"],
+                ),
+                override=True,
+            )
+            chat_template_name = template["name"]
+    else:
+        chat_template_name = chat_template_arg
+
+
+async def v1_completions(tokenizer_manager, raw_request: Request):
+    request_json = await raw_request.json()
+    request = CompletionRequest(**request_json)
+
+    # TODO: Validate the request and return HTTPStatus.BAD_REQUEST if invalid.
+    assert request.n == 1
+
+    adapted_request = GenerateReqInput(
+        text=request.prompt,
+        sampling_params={
+            "temperature": request.temperature,
+            "max_new_tokens": request.max_tokens,
+            "stop": request.stop,
+            "top_p": request.top_p,
+            "presence_penalty": request.presence_penalty,
+            "frequency_penalty": request.frequency_penalty,
+            "regex": request.regex,
+        },
+        return_logprob=request.logprobs is not None and request.logprobs > 0,
+        top_logprobs_num=request.logprobs if request.logprobs is not None else 0,
+        return_text_in_logprobs=True,
+        stream=request.stream,
+    )
+    adapted_request.post_init()
+
+    if adapted_request.stream:
+
+        async def generate_stream_resp():
+            stream_buffer = ""
+            n_prev_token = 0
+            async for content in tokenizer_manager.generate_request(adapted_request):
+                text = content["text"]
+                prompt_tokens = content["meta_info"]["prompt_tokens"]
+                completion_tokens = content["meta_info"]["completion_tokens"]
+
+                if not stream_buffer:  # The first chunk
+                    if request.echo:
+                        # Prepend prompt in response text.
+                        text = request.prompt + text
+
+                if request.logprobs:
+                    # The first chunk and echo is enabled.
+                    if not stream_buffer and request.echo:
+                        prefill_token_logprobs = content["meta_info"][
+                            "prefill_token_logprobs"
+                        ]
+                        prefill_top_logprobs = content["meta_info"][
+                            "prefill_top_logprobs"
+                        ]
+                    else:
+                        prefill_token_logprobs = None
+                        prefill_top_logprobs = None
+
+                    logprobs = to_openai_style_logprobs(
+                        prefill_token_logprobs=prefill_token_logprobs,
+                        prefill_top_logprobs=prefill_top_logprobs,
+                        decode_token_logprobs=content["meta_info"][
+                            "decode_token_logprobs"
+                        ][n_prev_token:],
+                        decode_top_logprobs=content["meta_info"]["decode_top_logprobs"][
+                            n_prev_token:
+                        ],
+                    )
+
+                    n_prev_token = len(content["meta_info"]["decode_token_logprobs"])
+                else:
+                    logprobs = None
+
+                delta = text[len(stream_buffer) :]
+                stream_buffer = content["text"]
+                choice_data = CompletionResponseStreamChoice(
+                    index=0,
+                    text=delta,
+                    logprobs=logprobs,
+                    finish_reason=None,
+                )
+                chunk = CompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    object="text_completion",
+                    choices=[choice_data],
+                    model=request.model,
+                    usage=UsageInfo(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=prompt_tokens + completion_tokens,
+                    ),
+                )
+                yield f"data: {jsonify_pydantic_model(chunk)}\n\n"
+            yield "data: [DONE]\n\n"
+
+        return StreamingResponse(generate_stream_resp(), media_type="text/event-stream")
+
+    # Non-streaming response.
+    ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
+    ret = ret[0] if isinstance(ret, list) else ret
+
+    prompt_tokens = ret["meta_info"]["prompt_tokens"]
+    completion_tokens = ret["meta_info"]["completion_tokens"]
+    text = ret["text"]
+    if request.echo:
+        text = request.prompt + text
+
+    if request.logprobs:
+        if request.echo:
+            prefill_token_logprobs = ret["meta_info"]["prefill_token_logprobs"]
+            prefill_top_logprobs = ret["meta_info"]["prefill_top_logprobs"]
+        else:
+            prefill_token_logprobs = None
+            prefill_top_logprobs = None
+
+        logprobs = to_openai_style_logprobs(
+            prefill_token_logprobs=prefill_token_logprobs,
+            prefill_top_logprobs=prefill_top_logprobs,
+            decode_token_logprobs=ret["meta_info"]["decode_token_logprobs"],
+            decode_top_logprobs=ret["meta_info"]["decode_top_logprobs"],
+        )
+    else:
+        logprobs = None
+
+    choice_data = CompletionResponseChoice(
+        index=0,
+        text=text,
+        logprobs=logprobs,
+        finish_reason=None,  # TODO(comaniac): Add finish reason.
+    )
+    response = CompletionResponse(
+        id=ret["meta_info"]["id"],
+        model=request.model,
+        choices=[choice_data],
+        usage=UsageInfo(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
+        ),
+    )
+    return response
+
+
+async def v1_chat_completions(tokenizer_manager, raw_request: Request):
+    request_json = await raw_request.json()
+    request = ChatCompletionRequest(**request_json)
+
+    # TODO: Validate the request and return HTTPStatus.BAD_REQUEST if invalid.
+    assert request.n == 1
+
+    # Prep the data needed for the underlying GenerateReqInput:
+    #  - prompt: The full prompt string.
+    #  - stop: Custom stop tokens.
+    #  - image_data: None or a list of image strings (URLs or base64 strings).
+    #    None skips any image processing in GenerateReqInput.
+    if not isinstance(request.messages, str):
+        # Apply chat template and its stop strings.
+        if chat_template_name is None:
+            prompt = tokenizer_manager.tokenizer.apply_chat_template(
+                request.messages, tokenize=False, add_generation_prompt=True
+            )
+            stop = request.stop
+            image_data = None
+        else:
+            conv = generate_chat_conv(request, chat_template_name)
+            prompt = conv.get_prompt()
+            image_data = conv.image_data
+            stop = conv.stop_str or []
+            if request.stop:
+                if isinstance(request.stop, str):
+                    stop.append(request.stop)
+                else:
+                    stop.extend(request.stop)
+    else:
+        # Use the raw prompt and stop strings if the messages is already a string.
+        prompt = request.messages
+        stop = request.stop
+        image_data = None
+
+    adapted_request = GenerateReqInput(
+        text=prompt,
+        image_data=image_data,
+        sampling_params={
+            "temperature": request.temperature,
+            "max_new_tokens": request.max_tokens,
+            "stop": stop,
+            "top_p": request.top_p,
+            "presence_penalty": request.presence_penalty,
+            "frequency_penalty": request.frequency_penalty,
+            "regex": request.regex,
+        },
+        stream=request.stream,
+    )
+    adapted_request.post_init()
+
+    if adapted_request.stream:
+
+        async def generate_stream_resp():
+            is_first = True
+
+            stream_buffer = ""
+            async for content in tokenizer_manager.generate_request(adapted_request):
+                if is_first:
+                    # First chunk with role
+                    is_first = False
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=0,
+                        delta=DeltaMessage(role="assistant"),
+                        finish_reason=None,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {jsonify_pydantic_model(chunk)}\n\n"
+
+                text = content["text"]
+                delta = text[len(stream_buffer) :]
+                stream_buffer = text
+                choice_data = ChatCompletionResponseStreamChoice(
+                    index=0, delta=DeltaMessage(content=delta), finish_reason=None
+                )
+                chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    choices=[choice_data],
+                    model=request.model,
+                )
+                yield f"data: {jsonify_pydantic_model(chunk)}\n\n"
+            yield "data: [DONE]\n\n"
+
+        return StreamingResponse(generate_stream_resp(), media_type="text/event-stream")
+
+    # Non-streaming response.
+    ret = await tokenizer_manager.generate_request(adapted_request).__anext__()
+    prompt_tokens = ret["meta_info"]["prompt_tokens"]
+    completion_tokens = ret["meta_info"]["completion_tokens"]
+    choice_data = ChatCompletionResponseChoice(
+        index=0,
+        message=ChatMessage(role="assistant", content=ret["text"]),
+        finish_reason=None,  # TODO(comaniac): Add finish reason.
+    )
+    response = ChatCompletionResponse(
+        id=ret["meta_info"]["id"],
+        model=request.model,
+        choices=[choice_data],
+        usage=UsageInfo(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
+        ),
+    )
+    return response
+
+
+def to_openai_style_logprobs(
+    prefill_token_logprobs=None,
+    decode_token_logprobs=None,
+    prefill_top_logprobs=None,
+    decode_top_logprobs=None,
+):
+    ret_logprobs = LogProbs()
+
+    def append_token_logprobs(token_logprobs):
+        for logprob, _, token_text in token_logprobs:
+            ret_logprobs.tokens.append(token_text)
+            ret_logprobs.token_logprobs.append(logprob)
+
+            # Not Supported yet
+            ret_logprobs.text_offset.append(-1)
+
+    def append_top_logprobs(top_logprobs):
+        for tokens in top_logprobs:
+            if tokens is not None:
+                ret_logprobs.top_logprobs.append(
+                    {token[2]: token[0] for token in tokens}
+                )
+            else:
+                ret_logprobs.top_logprobs.append(None)
+
+    if prefill_token_logprobs is not None:
+        append_token_logprobs(prefill_token_logprobs)
+    if decode_token_logprobs is not None:
+        append_token_logprobs(decode_token_logprobs)
+    if prefill_top_logprobs is not None:
+        append_top_logprobs(prefill_top_logprobs)
+    if decode_top_logprobs is not None:
+        append_top_logprobs(decode_top_logprobs)
+
+    return ret_logprobs
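The adapter above translates OpenAI-style completion and chat requests into the native `GenerateReqInput` path. As an illustration only (not part of this diff), the sketch below shows how a client might exercise the two new endpoints; it assumes an sglang server is already running locally on port 30000 and that the `openai` Python client (v1+) is installed.

```python
# Hypothetical usage sketch (not part of the package diff): point the OpenAI
# client at a locally running sglang server and hit the routes handled by
# v1_chat_completions() and v1_completions(). Host, port, and model name are
# assumptions for illustration.
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

# Non-streaming chat completion; the adapter builds a GenerateReqInput and
# returns a ChatCompletionResponse.
resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Name three uses of a radix cache."}],
    temperature=0.7,
    max_tokens=64,
)
print(resp.choices[0].message.content)

# Streaming text completion; each chunk mirrors a CompletionStreamResponse
# emitted by generate_stream_resp() as a server-sent event.
stream = client.completions.create(
    model="default",
    prompt="The capital of France is",
    max_tokens=8,
    stream=True,
)
for chunk in stream:
    print(chunk.choices[0].text, end="", flush=True)
```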
sglang/srt/{managers/openai_protocol.py → openai_protocol.py}
RENAMED
@@ -1,3 +1,4 @@
+"""pydantic models for OpenAI API protocol"""
 import time
 from typing import Dict, List, Optional, Union
 
@@ -19,21 +20,24 @@ class UsageInfo(BaseModel):
 
 
 class CompletionRequest(BaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/completions/create
     model: str
-    prompt: Union[str, List[str]]
-
-    max_tokens: Optional[int] = 16
-    temperature: Optional[float] = 0.7
-    top_p: Optional[float] = 1.0
-    n: Optional[int] = 1
-    stream: Optional[bool] = False
-    logprobs: Optional[int] = None
+    prompt: Union[List[int], List[List[int]], str, List[str]]
+    best_of: Optional[int] = None
     echo: Optional[bool] = False
-    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
-    presence_penalty: Optional[float] = 0.0
     frequency_penalty: Optional[float] = 0.0
-    best_of: Optional[int] = None
     logit_bias: Optional[Dict[str, float]] = None
+    logprobs: Optional[int] = None
+    max_tokens: Optional[int] = 16
+    n: int = 1
+    presence_penalty: Optional[float] = 0.0
+    seed: Optional[int] = None
+    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
+    stream: Optional[bool] = False
+    suffix: Optional[str] = None
+    temperature: Optional[float] = 1.0
+    top_p: Optional[float] = 1.0
     user: Optional[str] = None
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
@@ -107,20 +111,30 @@ ChatCompletionMessageParam = Union[
 ]
 
 
+class ResponseFormat(BaseModel):
+    # type must be "json_object" or "text"
+    type: Literal["text", "json_object"]
+
+
 class ChatCompletionRequest(BaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/chat/create
+    messages: List[ChatCompletionMessageParam]
     model: str
-
-
-
+    frequency_penalty: Optional[float] = 0.0
+    logit_bias: Optional[Dict[str, float]] = None
+    logprobs: Optional[bool] = False
+    top_logprobs: Optional[int] = None
+    max_tokens: Optional[int] = None
     n: Optional[int] = 1
-
+    presence_penalty: Optional[float] = 0.0
+    response_format: Optional[ResponseFormat] = None
+    seed: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
     stream: Optional[bool] = False
-
-
-    logit_bias: Optional[Dict[str, float]] = None
+    temperature: Optional[float] = 0.7
+    top_p: Optional[float] = 1.0
     user: Optional[str] = None
-    best_of: Optional[int] = None
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
@@ -134,6 +148,7 @@ class ChatMessage(BaseModel):
 class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
+    logprobs: Optional[LogProbs] = None
     finish_reason: Optional[str] = None
 
 
@@ -154,6 +169,7 @@ class DeltaMessage(BaseModel):
 class ChatCompletionResponseStreamChoice(BaseModel):
     index: int
     delta: DeltaMessage
+    logprobs: Optional[LogProbs] = None
     finish_reason: Optional[str] = None
 
 
@@ -162,4 +178,4 @@ class ChatCompletionStreamResponse(BaseModel):
     object: str = "chat.completion.chunk"
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
-    choices: List[ChatCompletionResponseStreamChoice]
+    choices: List[ChatCompletionResponseStreamChoice]
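For reference, here is a hedged sketch of instantiating the reordered request models directly; the field names come from the hunks above, while the concrete values and the plain-dict message form are illustrative assumptions.

```python
# Illustrative only: construct the updated pydantic request models with some
# of the fields added in this release (seed, suffix, response_format,
# top_logprobs). Values are placeholders, not defaults from the diff.
from sglang.srt.openai_protocol import ChatCompletionRequest, CompletionRequest

completion_req = CompletionRequest(
    model="default",
    prompt=[1, 2, 3],        # prompt now also accepts token-id lists
    max_tokens=16,
    temperature=1.0,
    seed=42,                 # newly added field
    suffix=None,             # newly added field
)

chat_req = ChatCompletionRequest(
    model="default",
    messages=[{"role": "user", "content": "Hi"}],       # coerced by pydantic
    response_format={"type": "json_object"},            # new ResponseFormat field
    top_logprobs=5,                                      # newly added field
    regex=None,                                          # SRT-only extra parameter
)
```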
sglang/srt/sampling_params.py
CHANGED
@@ -17,6 +17,7 @@ class SamplingParams:
         presence_penalty: float = 0.0,
         ignore_eos: bool = False,
         skip_special_tokens: bool = True,
+        spaces_between_special_tokens: bool = True,
         dtype: Optional[str] = None,
         regex: Optional[str] = None,
     ) -> None:
@@ -29,6 +30,7 @@ class SamplingParams:
         self.max_new_tokens = max_new_tokens
         self.ignore_eos = ignore_eos
         self.skip_special_tokens = skip_special_tokens
+        self.spaces_between_special_tokens = spaces_between_special_tokens
         self.dtype = dtype
         self.regex = regex
 
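A short, hypothetical sketch of passing the new flag through the native generation path; the endpoint path and payload shape are assumed from the adapter code earlier in this diff, not from this hunk.

```python
# Hypothetical sketch (not part of the diff): send the new
# spaces_between_special_tokens option through the native /generate route,
# alongside the existing skip_special_tokens flag. Host and port are assumptions.
import requests

payload = {
    "text": "The quick brown fox",
    "sampling_params": {
        "max_new_tokens": 16,
        "temperature": 0.0,
        "skip_special_tokens": True,
        "spaces_between_special_tokens": False,  # option added in this hunk
    },
}
resp = requests.post("http://127.0.0.1:30000/generate", json=payload)
print(resp.json()["text"])
```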