sglang 0.1.14__py3-none-any.whl → 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +59 -2
- sglang/api.py +40 -11
- sglang/backend/anthropic.py +17 -3
- sglang/backend/litellm.py +90 -0
- sglang/backend/openai.py +160 -12
- sglang/backend/runtime_endpoint.py +62 -27
- sglang/backend/vertexai.py +1 -0
- sglang/bench_latency.py +320 -0
- sglang/global_config.py +24 -3
- sglang/lang/chat_template.py +122 -6
- sglang/lang/compiler.py +2 -2
- sglang/lang/interpreter.py +206 -98
- sglang/lang/ir.py +98 -34
- sglang/lang/tracer.py +6 -4
- sglang/launch_server.py +4 -1
- sglang/launch_server_llavavid.py +32 -0
- sglang/srt/constrained/__init__.py +14 -6
- sglang/srt/constrained/fsm_cache.py +9 -2
- sglang/srt/constrained/jump_forward.py +113 -24
- sglang/srt/conversation.py +4 -2
- sglang/srt/flush_cache.py +18 -0
- sglang/srt/hf_transformers_utils.py +144 -3
- sglang/srt/layers/context_flashattention_nopad.py +1 -0
- sglang/srt/layers/extend_attention.py +20 -1
- sglang/srt/layers/fused_moe.py +596 -0
- sglang/srt/layers/logits_processor.py +190 -61
- sglang/srt/layers/radix_attention.py +62 -53
- sglang/srt/layers/token_attention.py +21 -9
- sglang/srt/managers/controller/cuda_graph_runner.py +196 -0
- sglang/srt/managers/controller/dp_worker.py +113 -0
- sglang/srt/managers/controller/infer_batch.py +908 -0
- sglang/srt/managers/controller/manager_multi.py +195 -0
- sglang/srt/managers/controller/manager_single.py +177 -0
- sglang/srt/managers/controller/model_runner.py +359 -0
- sglang/srt/managers/{router → controller}/radix_cache.py +102 -53
- sglang/srt/managers/controller/schedule_heuristic.py +65 -0
- sglang/srt/managers/controller/tp_worker.py +813 -0
- sglang/srt/managers/detokenizer_manager.py +42 -40
- sglang/srt/managers/io_struct.py +44 -10
- sglang/srt/managers/tokenizer_manager.py +224 -82
- sglang/srt/memory_pool.py +52 -59
- sglang/srt/model_config.py +97 -2
- sglang/srt/models/chatglm.py +399 -0
- sglang/srt/models/commandr.py +369 -0
- sglang/srt/models/dbrx.py +406 -0
- sglang/srt/models/gemma.py +34 -38
- sglang/srt/models/gemma2.py +436 -0
- sglang/srt/models/grok.py +738 -0
- sglang/srt/models/llama2.py +47 -37
- sglang/srt/models/llama_classification.py +107 -0
- sglang/srt/models/llava.py +92 -27
- sglang/srt/models/llavavid.py +298 -0
- sglang/srt/models/minicpm.py +366 -0
- sglang/srt/models/mixtral.py +302 -127
- sglang/srt/models/mixtral_quant.py +372 -0
- sglang/srt/models/qwen.py +40 -35
- sglang/srt/models/qwen2.py +33 -36
- sglang/srt/models/qwen2_moe.py +473 -0
- sglang/srt/models/stablelm.py +33 -39
- sglang/srt/models/yivl.py +19 -26
- sglang/srt/openai_api_adapter.py +411 -0
- sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +44 -19
- sglang/srt/sampling_params.py +2 -0
- sglang/srt/server.py +197 -481
- sglang/srt/server_args.py +190 -74
- sglang/srt/utils.py +460 -95
- sglang/test/test_programs.py +73 -10
- sglang/test/test_utils.py +226 -7
- sglang/utils.py +97 -27
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/METADATA +74 -45
- sglang-0.1.21.dist-info/RECORD +82 -0
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/WHEEL +1 -1
- sglang/srt/backend_config.py +0 -13
- sglang/srt/managers/router/infer_batch.py +0 -503
- sglang/srt/managers/router/manager.py +0 -79
- sglang/srt/managers/router/model_rpc.py +0 -686
- sglang/srt/managers/router/model_runner.py +0 -514
- sglang/srt/managers/router/scheduler.py +0 -70
- sglang-0.1.14.dist-info/RECORD +0 -64
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/LICENSE +0 -0
- {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,5 @@
|
|
1
|
+
"""Pydantic models for OpenAI API protocol"""
|
2
|
+
|
1
3
|
import time
|
2
4
|
from typing import Dict, List, Optional, Union
|
3
5
|
|
@@ -5,6 +7,14 @@ from pydantic import BaseModel, Field
|
|
5
7
|
from typing_extensions import Literal
|
6
8
|
|
7
9
|
|
10
|
+
class ErrorResponse(BaseModel):
|
11
|
+
object: str = "error"
|
12
|
+
message: str
|
13
|
+
type: str
|
14
|
+
param: Optional[str] = None
|
15
|
+
code: int
|
16
|
+
|
17
|
+
|
8
18
|
class LogProbs(BaseModel):
|
9
19
|
text_offset: List[int] = Field(default_factory=list)
|
10
20
|
token_logprobs: List[Optional[float]] = Field(default_factory=list)
|
@@ -19,21 +29,24 @@ class UsageInfo(BaseModel):
|
|
19
29
|
|
20
30
|
|
21
31
|
class CompletionRequest(BaseModel):
|
32
|
+
# Ordered by official OpenAI API documentation
|
33
|
+
# https://platform.openai.com/docs/api-reference/completions/create
|
22
34
|
model: str
|
23
|
-
prompt: Union[str, List[str]]
|
24
|
-
|
25
|
-
max_tokens: Optional[int] = 16
|
26
|
-
temperature: Optional[float] = 0.7
|
27
|
-
top_p: Optional[float] = 1.0
|
28
|
-
n: Optional[int] = 1
|
29
|
-
stream: Optional[bool] = False
|
30
|
-
logprobs: Optional[int] = None
|
35
|
+
prompt: Union[List[int], List[List[int]], str, List[str]]
|
36
|
+
best_of: Optional[int] = None
|
31
37
|
echo: Optional[bool] = False
|
32
|
-
stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
|
33
|
-
presence_penalty: Optional[float] = 0.0
|
34
38
|
frequency_penalty: Optional[float] = 0.0
|
35
|
-
best_of: Optional[int] = None
|
36
39
|
logit_bias: Optional[Dict[str, float]] = None
|
40
|
+
logprobs: Optional[int] = None
|
41
|
+
max_tokens: Optional[int] = 16
|
42
|
+
n: int = 1
|
43
|
+
presence_penalty: Optional[float] = 0.0
|
44
|
+
seed: Optional[int] = None
|
45
|
+
stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
|
46
|
+
stream: Optional[bool] = False
|
47
|
+
suffix: Optional[str] = None
|
48
|
+
temperature: Optional[float] = 1.0
|
49
|
+
top_p: Optional[float] = 1.0
|
37
50
|
user: Optional[str] = None
|
38
51
|
|
39
52
|
# Extra parameters for SRT backend only and will be ignored by OpenAI models.
|
@@ -107,20 +120,30 @@ ChatCompletionMessageParam = Union[
|
|
107
120
|
]
|
108
121
|
|
109
122
|
|
123
|
+
class ResponseFormat(BaseModel):
|
124
|
+
# type must be "json_object" or "text"
|
125
|
+
type: Literal["text", "json_object"]
|
126
|
+
|
127
|
+
|
110
128
|
class ChatCompletionRequest(BaseModel):
|
129
|
+
# Ordered by official OpenAI API documentation
|
130
|
+
# https://platform.openai.com/docs/api-reference/chat/create
|
131
|
+
messages: List[ChatCompletionMessageParam]
|
111
132
|
model: str
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
133
|
+
frequency_penalty: Optional[float] = 0.0
|
134
|
+
logit_bias: Optional[Dict[str, float]] = None
|
135
|
+
logprobs: Optional[bool] = False
|
136
|
+
top_logprobs: Optional[int] = None
|
116
137
|
max_tokens: Optional[int] = 16
|
138
|
+
n: Optional[int] = 1
|
139
|
+
presence_penalty: Optional[float] = 0.0
|
140
|
+
response_format: Optional[ResponseFormat] = None
|
141
|
+
seed: Optional[int] = None
|
117
142
|
stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
|
118
143
|
stream: Optional[bool] = False
|
119
|
-
|
120
|
-
|
121
|
-
logit_bias: Optional[Dict[str, float]] = None
|
144
|
+
temperature: Optional[float] = 0.7
|
145
|
+
top_p: Optional[float] = 1.0
|
122
146
|
user: Optional[str] = None
|
123
|
-
best_of: Optional[int] = None
|
124
147
|
|
125
148
|
# Extra parameters for SRT backend only and will be ignored by OpenAI models.
|
126
149
|
regex: Optional[str] = None
|
@@ -134,6 +157,7 @@ class ChatMessage(BaseModel):
|
|
134
157
|
class ChatCompletionResponseChoice(BaseModel):
|
135
158
|
index: int
|
136
159
|
message: ChatMessage
|
160
|
+
logprobs: Optional[LogProbs] = None
|
137
161
|
finish_reason: Optional[str] = None
|
138
162
|
|
139
163
|
|
@@ -154,6 +178,7 @@ class DeltaMessage(BaseModel):
|
|
154
178
|
class ChatCompletionResponseStreamChoice(BaseModel):
|
155
179
|
index: int
|
156
180
|
delta: DeltaMessage
|
181
|
+
logprobs: Optional[LogProbs] = None
|
157
182
|
finish_reason: Optional[str] = None
|
158
183
|
|
159
184
|
|
sglang/srt/sampling_params.py
CHANGED
@@ -17,6 +17,7 @@ class SamplingParams:
|
|
17
17
|
presence_penalty: float = 0.0,
|
18
18
|
ignore_eos: bool = False,
|
19
19
|
skip_special_tokens: bool = True,
|
20
|
+
spaces_between_special_tokens: bool = True,
|
20
21
|
dtype: Optional[str] = None,
|
21
22
|
regex: Optional[str] = None,
|
22
23
|
) -> None:
|
@@ -29,6 +30,7 @@ class SamplingParams:
|
|
29
30
|
self.max_new_tokens = max_new_tokens
|
30
31
|
self.ignore_eos = ignore_eos
|
31
32
|
self.skip_special_tokens = skip_special_tokens
|
33
|
+
self.spaces_between_special_tokens = spaces_between_special_tokens
|
32
34
|
self.dtype = dtype
|
33
35
|
self.regex = regex
|
34
36
|
|