sglang 0.1.14__py3-none-any.whl → 0.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81):
  1. sglang/__init__.py +59 -2
  2. sglang/api.py +40 -11
  3. sglang/backend/anthropic.py +17 -3
  4. sglang/backend/litellm.py +90 -0
  5. sglang/backend/openai.py +160 -12
  6. sglang/backend/runtime_endpoint.py +62 -27
  7. sglang/backend/vertexai.py +1 -0
  8. sglang/bench_latency.py +320 -0
  9. sglang/global_config.py +24 -3
  10. sglang/lang/chat_template.py +122 -6
  11. sglang/lang/compiler.py +2 -2
  12. sglang/lang/interpreter.py +206 -98
  13. sglang/lang/ir.py +98 -34
  14. sglang/lang/tracer.py +6 -4
  15. sglang/launch_server.py +4 -1
  16. sglang/launch_server_llavavid.py +32 -0
  17. sglang/srt/constrained/__init__.py +14 -6
  18. sglang/srt/constrained/fsm_cache.py +9 -2
  19. sglang/srt/constrained/jump_forward.py +113 -24
  20. sglang/srt/conversation.py +4 -2
  21. sglang/srt/flush_cache.py +18 -0
  22. sglang/srt/hf_transformers_utils.py +144 -3
  23. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  24. sglang/srt/layers/extend_attention.py +20 -1
  25. sglang/srt/layers/fused_moe.py +596 -0
  26. sglang/srt/layers/logits_processor.py +190 -61
  27. sglang/srt/layers/radix_attention.py +62 -53
  28. sglang/srt/layers/token_attention.py +21 -9
  29. sglang/srt/managers/controller/cuda_graph_runner.py +196 -0
  30. sglang/srt/managers/controller/dp_worker.py +113 -0
  31. sglang/srt/managers/controller/infer_batch.py +908 -0
  32. sglang/srt/managers/controller/manager_multi.py +195 -0
  33. sglang/srt/managers/controller/manager_single.py +177 -0
  34. sglang/srt/managers/controller/model_runner.py +359 -0
  35. sglang/srt/managers/{router → controller}/radix_cache.py +102 -53
  36. sglang/srt/managers/controller/schedule_heuristic.py +65 -0
  37. sglang/srt/managers/controller/tp_worker.py +813 -0
  38. sglang/srt/managers/detokenizer_manager.py +42 -40
  39. sglang/srt/managers/io_struct.py +44 -10
  40. sglang/srt/managers/tokenizer_manager.py +224 -82
  41. sglang/srt/memory_pool.py +52 -59
  42. sglang/srt/model_config.py +97 -2
  43. sglang/srt/models/chatglm.py +399 -0
  44. sglang/srt/models/commandr.py +369 -0
  45. sglang/srt/models/dbrx.py +406 -0
  46. sglang/srt/models/gemma.py +34 -38
  47. sglang/srt/models/gemma2.py +436 -0
  48. sglang/srt/models/grok.py +738 -0
  49. sglang/srt/models/llama2.py +47 -37
  50. sglang/srt/models/llama_classification.py +107 -0
  51. sglang/srt/models/llava.py +92 -27
  52. sglang/srt/models/llavavid.py +298 -0
  53. sglang/srt/models/minicpm.py +366 -0
  54. sglang/srt/models/mixtral.py +302 -127
  55. sglang/srt/models/mixtral_quant.py +372 -0
  56. sglang/srt/models/qwen.py +40 -35
  57. sglang/srt/models/qwen2.py +33 -36
  58. sglang/srt/models/qwen2_moe.py +473 -0
  59. sglang/srt/models/stablelm.py +33 -39
  60. sglang/srt/models/yivl.py +19 -26
  61. sglang/srt/openai_api_adapter.py +411 -0
  62. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +44 -19
  63. sglang/srt/sampling_params.py +2 -0
  64. sglang/srt/server.py +197 -481
  65. sglang/srt/server_args.py +190 -74
  66. sglang/srt/utils.py +460 -95
  67. sglang/test/test_programs.py +73 -10
  68. sglang/test/test_utils.py +226 -7
  69. sglang/utils.py +97 -27
  70. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/METADATA +74 -45
  71. sglang-0.1.21.dist-info/RECORD +82 -0
  72. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/WHEEL +1 -1
  73. sglang/srt/backend_config.py +0 -13
  74. sglang/srt/managers/router/infer_batch.py +0 -503
  75. sglang/srt/managers/router/manager.py +0 -79
  76. sglang/srt/managers/router/model_rpc.py +0 -686
  77. sglang/srt/managers/router/model_runner.py +0 -514
  78. sglang/srt/managers/router/scheduler.py +0 -70
  79. sglang-0.1.14.dist-info/RECORD +0 -64
  80. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/LICENSE +0 -0
  81. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,5 @@
+"""Pydantic models for OpenAI API protocol"""
+
 import time
 from typing import Dict, List, Optional, Union
 
@@ -5,6 +7,14 @@ from pydantic import BaseModel, Field
 from typing_extensions import Literal
 
 
+class ErrorResponse(BaseModel):
+    object: str = "error"
+    message: str
+    type: str
+    param: Optional[str] = None
+    code: int
+
+
 class LogProbs(BaseModel):
     text_offset: List[int] = Field(default_factory=list)
     token_logprobs: List[Optional[float]] = Field(default_factory=list)
@@ -19,21 +29,24 @@ class UsageInfo(BaseModel):
 
 
 class CompletionRequest(BaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/completions/create
     model: str
-    prompt: Union[str, List[str]]
-    suffix: Optional[str] = None
-    max_tokens: Optional[int] = 16
-    temperature: Optional[float] = 0.7
-    top_p: Optional[float] = 1.0
-    n: Optional[int] = 1
-    stream: Optional[bool] = False
-    logprobs: Optional[int] = None
+    prompt: Union[List[int], List[List[int]], str, List[str]]
+    best_of: Optional[int] = None
     echo: Optional[bool] = False
-    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
-    presence_penalty: Optional[float] = 0.0
     frequency_penalty: Optional[float] = 0.0
-    best_of: Optional[int] = None
     logit_bias: Optional[Dict[str, float]] = None
+    logprobs: Optional[int] = None
+    max_tokens: Optional[int] = 16
+    n: int = 1
+    presence_penalty: Optional[float] = 0.0
+    seed: Optional[int] = None
+    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
+    stream: Optional[bool] = False
+    suffix: Optional[str] = None
+    temperature: Optional[float] = 1.0
+    top_p: Optional[float] = 1.0
     user: Optional[str] = None
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
@@ -107,20 +120,30 @@ ChatCompletionMessageParam = Union[
 ]
 
 
+class ResponseFormat(BaseModel):
+    # type must be "json_object" or "text"
+    type: Literal["text", "json_object"]
+
+
 class ChatCompletionRequest(BaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/chat/create
+    messages: List[ChatCompletionMessageParam]
     model: str
-    messages: Union[str, List[ChatCompletionMessageParam]]
-    temperature: Optional[float] = 0.7
-    top_p: Optional[float] = 1.0
-    n: Optional[int] = 1
+    frequency_penalty: Optional[float] = 0.0
+    logit_bias: Optional[Dict[str, float]] = None
+    logprobs: Optional[bool] = False
+    top_logprobs: Optional[int] = None
     max_tokens: Optional[int] = 16
+    n: Optional[int] = 1
+    presence_penalty: Optional[float] = 0.0
+    response_format: Optional[ResponseFormat] = None
+    seed: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
     stream: Optional[bool] = False
-    presence_penalty: Optional[float] = 0.0
-    frequency_penalty: Optional[float] = 0.0
-    logit_bias: Optional[Dict[str, float]] = None
+    temperature: Optional[float] = 0.7
+    top_p: Optional[float] = 1.0
     user: Optional[str] = None
-    best_of: Optional[int] = None
 
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     regex: Optional[str] = None
@@ -134,6 +157,7 @@ class ChatMessage(BaseModel):
 class ChatCompletionResponseChoice(BaseModel):
     index: int
     message: ChatMessage
+    logprobs: Optional[LogProbs] = None
     finish_reason: Optional[str] = None
 
 
@@ -154,6 +178,7 @@ class DeltaMessage(BaseModel):
 class ChatCompletionResponseStreamChoice(BaseModel):
     index: int
     delta: DeltaMessage
+    logprobs: Optional[LogProbs] = None
     finish_reason: Optional[str] = None
 
 
@@ -17,6 +17,7 @@ class SamplingParams:
         presence_penalty: float = 0.0,
         ignore_eos: bool = False,
         skip_special_tokens: bool = True,
+        spaces_between_special_tokens: bool = True,
         dtype: Optional[str] = None,
         regex: Optional[str] = None,
     ) -> None:
@@ -29,6 +30,7 @@
         self.max_new_tokens = max_new_tokens
         self.ignore_eos = ignore_eos
         self.skip_special_tokens = skip_special_tokens
+        self.spaces_between_special_tokens = spaces_between_special_tokens
         self.dtype = dtype
         self.regex = regex