sglang 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. sglang/__init__.py +55 -2
  2. sglang/api.py +3 -5
  3. sglang/backend/anthropic.py +18 -4
  4. sglang/backend/openai.py +2 -1
  5. sglang/backend/runtime_endpoint.py +18 -5
  6. sglang/backend/vertexai.py +1 -0
  7. sglang/global_config.py +1 -0
  8. sglang/lang/chat_template.py +74 -0
  9. sglang/lang/interpreter.py +40 -16
  10. sglang/lang/tracer.py +6 -4
  11. sglang/launch_server.py +2 -1
  12. sglang/srt/constrained/fsm_cache.py +1 -0
  13. sglang/srt/constrained/jump_forward.py +1 -0
  14. sglang/srt/conversation.py +2 -2
  15. sglang/srt/hf_transformers_utils.py +2 -1
  16. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  17. sglang/srt/layers/extend_attention.py +1 -0
  18. sglang/srt/layers/logits_processor.py +114 -54
  19. sglang/srt/layers/radix_attention.py +2 -1
  20. sglang/srt/layers/token_attention.py +1 -0
  21. sglang/srt/managers/detokenizer_manager.py +5 -1
  22. sglang/srt/managers/io_struct.py +12 -0
  23. sglang/srt/managers/router/infer_batch.py +70 -33
  24. sglang/srt/managers/router/manager.py +7 -2
  25. sglang/srt/managers/router/model_rpc.py +116 -73
  26. sglang/srt/managers/router/model_runner.py +111 -167
  27. sglang/srt/managers/router/radix_cache.py +46 -38
  28. sglang/srt/managers/tokenizer_manager.py +56 -11
  29. sglang/srt/memory_pool.py +5 -14
  30. sglang/srt/model_config.py +7 -0
  31. sglang/srt/models/commandr.py +376 -0
  32. sglang/srt/models/dbrx.py +413 -0
  33. sglang/srt/models/dbrx_config.py +281 -0
  34. sglang/srt/models/gemma.py +22 -20
  35. sglang/srt/models/llama2.py +23 -21
  36. sglang/srt/models/llava.py +12 -10
  37. sglang/srt/models/mixtral.py +27 -25
  38. sglang/srt/models/qwen.py +23 -21
  39. sglang/srt/models/qwen2.py +23 -21
  40. sglang/srt/models/stablelm.py +20 -21
  41. sglang/srt/models/yivl.py +6 -5
  42. sglang/srt/openai_api_adapter.py +356 -0
  43. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +36 -20
  44. sglang/srt/sampling_params.py +2 -0
  45. sglang/srt/server.py +68 -447
  46. sglang/srt/server_args.py +76 -49
  47. sglang/srt/utils.py +88 -32
  48. sglang/srt/weight_utils.py +402 -0
  49. sglang/test/test_programs.py +8 -7
  50. sglang/test/test_utils.py +195 -7
  51. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/METADATA +12 -14
  52. sglang-0.1.15.dist-info/RECORD +69 -0
  53. sglang-0.1.14.dist-info/RECORD +0 -64
  54. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/LICENSE +0 -0
  55. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/WHEEL +0 -0
  56. {sglang-0.1.14.dist-info → sglang-0.1.15.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,4 @@
1
+ """pydantic models for OpenAI API protocol"""
1
2
  import time
2
3
  from typing import Dict, List, Optional, Union
3
4
 
@@ -19,21 +20,24 @@ class UsageInfo(BaseModel):
19
20
 
20
21
 
21
22
  class CompletionRequest(BaseModel):
23
+ # Ordered by official OpenAI API documentation
24
+ # https://platform.openai.com/docs/api-reference/completions/create
22
25
  model: str
23
- prompt: Union[str, List[str]]
24
- suffix: Optional[str] = None
25
- max_tokens: Optional[int] = 16
26
- temperature: Optional[float] = 0.7
27
- top_p: Optional[float] = 1.0
28
- n: Optional[int] = 1
29
- stream: Optional[bool] = False
30
- logprobs: Optional[int] = None
26
+ prompt: Union[List[int], List[List[int]], str, List[str]]
27
+ best_of: Optional[int] = None
31
28
  echo: Optional[bool] = False
32
- stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
33
- presence_penalty: Optional[float] = 0.0
34
29
  frequency_penalty: Optional[float] = 0.0
35
- best_of: Optional[int] = None
36
30
  logit_bias: Optional[Dict[str, float]] = None
31
+ logprobs: Optional[int] = None
32
+ max_tokens: Optional[int] = 16
33
+ n: int = 1
34
+ presence_penalty: Optional[float] = 0.0
35
+ seed: Optional[int] = None
36
+ stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
37
+ stream: Optional[bool] = False
38
+ suffix: Optional[str] = None
39
+ temperature: Optional[float] = 1.0
40
+ top_p: Optional[float] = 1.0
37
41
  user: Optional[str] = None
38
42
 
39
43
  # Extra parameters for SRT backend only and will be ignored by OpenAI models.
@@ -107,20 +111,30 @@ ChatCompletionMessageParam = Union[
107
111
  ]
108
112
 
109
113
 
114
+ class ResponseFormat(BaseModel):
115
+ # type must be "json_object" or "text"
116
+ type: Literal["text", "json_object"]
117
+
118
+
110
119
  class ChatCompletionRequest(BaseModel):
120
+ # Ordered by official OpenAI API documentation
121
+ # https://platform.openai.com/docs/api-reference/chat/create
122
+ messages: List[ChatCompletionMessageParam]
111
123
  model: str
112
- messages: Union[str, List[ChatCompletionMessageParam]]
113
- temperature: Optional[float] = 0.7
114
- top_p: Optional[float] = 1.0
124
+ frequency_penalty: Optional[float] = 0.0
125
+ logit_bias: Optional[Dict[str, float]] = None
126
+ logprobs: Optional[bool] = False
127
+ top_logprobs: Optional[int] = None
128
+ max_tokens: Optional[int] = None
115
129
  n: Optional[int] = 1
116
- max_tokens: Optional[int] = 16
130
+ presence_penalty: Optional[float] = 0.0
131
+ response_format: Optional[ResponseFormat] = None
132
+ seed: Optional[int] = None
117
133
  stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
118
134
  stream: Optional[bool] = False
119
- presence_penalty: Optional[float] = 0.0
120
- frequency_penalty: Optional[float] = 0.0
121
- logit_bias: Optional[Dict[str, float]] = None
135
+ temperature: Optional[float] = 0.7
136
+ top_p: Optional[float] = 1.0
122
137
  user: Optional[str] = None
123
- best_of: Optional[int] = None
124
138
 
125
139
  # Extra parameters for SRT backend only and will be ignored by OpenAI models.
126
140
  regex: Optional[str] = None
@@ -134,6 +148,7 @@ class ChatMessage(BaseModel):
134
148
  class ChatCompletionResponseChoice(BaseModel):
135
149
  index: int
136
150
  message: ChatMessage
151
+ logprobs: Optional[LogProbs] = None
137
152
  finish_reason: Optional[str] = None
138
153
 
139
154
 
@@ -154,6 +169,7 @@ class DeltaMessage(BaseModel):
154
169
  class ChatCompletionResponseStreamChoice(BaseModel):
155
170
  index: int
156
171
  delta: DeltaMessage
172
+ logprobs: Optional[LogProbs] = None
157
173
  finish_reason: Optional[str] = None
158
174
 
159
175
 
@@ -162,4 +178,4 @@ class ChatCompletionStreamResponse(BaseModel):
162
178
  object: str = "chat.completion.chunk"
163
179
  created: int = Field(default_factory=lambda: int(time.time()))
164
180
  model: str
165
- choices: List[ChatCompletionResponseStreamChoice]
181
+ choices: List[ChatCompletionResponseStreamChoice]
@@ -17,6 +17,7 @@ class SamplingParams:
17
17
  presence_penalty: float = 0.0,
18
18
  ignore_eos: bool = False,
19
19
  skip_special_tokens: bool = True,
20
+ spaces_between_special_tokens: bool = True,
20
21
  dtype: Optional[str] = None,
21
22
  regex: Optional[str] = None,
22
23
  ) -> None:
@@ -29,6 +30,7 @@ class SamplingParams:
29
30
  self.max_new_tokens = max_new_tokens
30
31
  self.ignore_eos = ignore_eos
31
32
  self.skip_special_tokens = skip_special_tokens
33
+ self.spaces_between_special_tokens = spaces_between_special_tokens
32
34
  self.dtype = dtype
33
35
  self.regex = regex
34
36