sglang 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff compares publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between the versions as published.
Files changed (68)
  1. sglang/__init__.py +3 -1
  2. sglang/api.py +7 -7
  3. sglang/backend/anthropic.py +1 -1
  4. sglang/backend/litellm.py +90 -0
  5. sglang/backend/openai.py +158 -11
  6. sglang/backend/runtime_endpoint.py +18 -10
  7. sglang/bench_latency.py +299 -0
  8. sglang/global_config.py +12 -2
  9. sglang/lang/compiler.py +2 -2
  10. sglang/lang/interpreter.py +114 -67
  11. sglang/lang/ir.py +28 -3
  12. sglang/launch_server.py +4 -1
  13. sglang/launch_server_llavavid.py +2 -1
  14. sglang/srt/constrained/__init__.py +13 -6
  15. sglang/srt/constrained/fsm_cache.py +8 -2
  16. sglang/srt/constrained/jump_forward.py +113 -25
  17. sglang/srt/conversation.py +2 -0
  18. sglang/srt/flush_cache.py +3 -1
  19. sglang/srt/hf_transformers_utils.py +130 -1
  20. sglang/srt/layers/extend_attention.py +17 -0
  21. sglang/srt/layers/fused_moe.py +582 -0
  22. sglang/srt/layers/logits_processor.py +65 -32
  23. sglang/srt/layers/radix_attention.py +41 -7
  24. sglang/srt/layers/token_attention.py +16 -1
  25. sglang/srt/managers/controller/dp_worker.py +113 -0
  26. sglang/srt/managers/{router → controller}/infer_batch.py +242 -100
  27. sglang/srt/managers/controller/manager_multi.py +191 -0
  28. sglang/srt/managers/{router/manager.py → controller/manager_single.py} +34 -14
  29. sglang/srt/managers/{router → controller}/model_runner.py +262 -158
  30. sglang/srt/managers/{router → controller}/radix_cache.py +11 -1
  31. sglang/srt/managers/{router/scheduler.py → controller/schedule_heuristic.py} +9 -7
  32. sglang/srt/managers/{router/model_rpc.py → controller/tp_worker.py} +298 -267
  33. sglang/srt/managers/detokenizer_manager.py +42 -46
  34. sglang/srt/managers/io_struct.py +22 -12
  35. sglang/srt/managers/tokenizer_manager.py +151 -87
  36. sglang/srt/model_config.py +83 -5
  37. sglang/srt/models/chatglm.py +399 -0
  38. sglang/srt/models/commandr.py +10 -13
  39. sglang/srt/models/dbrx.py +9 -15
  40. sglang/srt/models/gemma.py +12 -15
  41. sglang/srt/models/grok.py +738 -0
  42. sglang/srt/models/llama2.py +26 -15
  43. sglang/srt/models/llama_classification.py +104 -0
  44. sglang/srt/models/llava.py +86 -19
  45. sglang/srt/models/llavavid.py +11 -20
  46. sglang/srt/models/mixtral.py +282 -103
  47. sglang/srt/models/mixtral_quant.py +372 -0
  48. sglang/srt/models/qwen.py +9 -13
  49. sglang/srt/models/qwen2.py +11 -13
  50. sglang/srt/models/stablelm.py +9 -15
  51. sglang/srt/models/yivl.py +17 -22
  52. sglang/srt/openai_api_adapter.py +150 -95
  53. sglang/srt/openai_protocol.py +11 -2
  54. sglang/srt/server.py +124 -48
  55. sglang/srt/server_args.py +128 -48
  56. sglang/srt/utils.py +234 -67
  57. sglang/test/test_programs.py +65 -3
  58. sglang/test/test_utils.py +32 -1
  59. sglang/utils.py +23 -4
  60. {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/METADATA +40 -27
  61. sglang-0.1.18.dist-info/RECORD +78 -0
  62. {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/WHEEL +1 -1
  63. sglang/srt/backend_config.py +0 -13
  64. sglang/srt/models/dbrx_config.py +0 -281
  65. sglang/srt/weight_utils.py +0 -417
  66. sglang-0.1.16.dist-info/RECORD +0 -72
  67. {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/LICENSE +0 -0
  68. {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/top_level.txt +0 -0
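Several server flags were renamed or added in 0.1.18, as the server_args.py diff below shows: --max-prefill-num-token became --max-prefill-tokens, --enable-flashinfer was inverted to --disable-flashinfer (flashinfer kernels are now on by default), and new options cover --dtype, --quantization, data parallelism (--dp-size, --load-balance-method), and multi-node serving (--nccl-init-addr, --nnodes, --node-rank). The following is a minimal sketch of driving the new parser programmatically; it assumes sglang 0.1.18 is installed, the model path is an illustrative placeholder, and the --model-path flag (not part of this diff) keeps its pre-existing name:

    # Hedged sketch exercising the 0.1.18 CLI surface shown in the diff below.
    import argparse

    from sglang.srt.server_args import ServerArgs

    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)

    args = parser.parse_args([
        "--model-path", "meta-llama/Llama-2-7b-chat-hf",  # placeholder model
        "--dtype", "bfloat16",            # new in 0.1.18
        "--max-prefill-tokens", "8192",   # renamed from --max-prefill-num-token
        "--max-running-requests", "64",   # new in 0.1.18
        "--dp-size", "2",                 # new: data parallelism
        "--load-balance-method", "shortest_queue",
    ])
    server_args = ServerArgs.from_cli_args(args)

    # random_seed is now Optional and drawn in __post_init__ when unset.
    assert server_args.random_seed is not None
    print(server_args.print_mode_args())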
sglang/srt/server_args.py CHANGED
@@ -2,6 +2,7 @@
 
 import argparse
 import dataclasses
+import random
 from typing import List, Optional, Union
 
 
@@ -10,11 +11,13 @@ class ServerArgs:
     # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
-    load_format: str = "auto"
     tokenizer_mode: str = "auto"
-    chat_template: Optional[str] = None
+    load_format: str = "auto"
+    dtype: str = "auto"
     trust_remote_code: bool = True
     context_length: Optional[int] = None
+    quantization: Optional[str] = None
+    chat_template: Optional[str] = None
 
     # Port
     host: str = "127.0.0.1"
@@ -23,31 +26,40 @@ class ServerArgs:
 
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
-    max_prefill_num_token: Optional[int] = None
+    max_prefill_tokens: Optional[int] = None
+    max_running_requests: Optional[int] = None
     schedule_heuristic: str = "lpm"
     schedule_conservativeness: float = 1.0
 
     # Other runtime options
     tp_size: int = 1
     stream_interval: int = 8
-    random_seed: int = 42
+    random_seed: Optional[int] = None
 
     # Logging
     log_level: str = "info"
+    log_level_http: Optional[str] = None
     log_requests: bool = False
-    disable_log_stats: bool = False
-    log_stats_interval: int = 10
     show_time_cost: bool = False
 
     # Other
     api_key: str = ""
 
+    # Data parallelism
+    dp_size: int = 1
+    load_balance_method: str = "round_robin"
+
     # Optimization/debug options
-    enable_flashinfer: bool = False
-    attention_reduce_in_fp32: bool = False
+    disable_flashinfer: bool = False
     disable_radix_cache: bool = False
     disable_regex_jump_forward: bool = False
     disable_disk_cache: bool = False
+    attention_reduce_in_fp32: bool = False
+
+    # Distributed args
+    nccl_init_addr: Optional[str] = None
+    nnodes: int = 1
+    node_rank: Optional[int] = None
 
     def __post_init__(self):
         if self.tokenizer_path is None:
@@ -66,6 +78,9 @@ class ServerArgs:
         elif self.additional_ports is None:
             self.additional_ports = []
 
+        if self.random_seed is None:
+            self.random_seed = random.randint(0, 1 << 30)
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         parser.add_argument(
@@ -91,7 +106,16 @@ class ServerArgs:
             type=int,
             nargs="*",
             default=[],
-            help="Additional ports specified for the server.",
+            help="The additional ports specified for the server.",
+        )
+        parser.add_argument(
+            "--tokenizer-mode",
+            type=str,
+            default=ServerArgs.tokenizer_mode,
+            choices=["auto", "slow"],
+            help="Tokenizer mode. 'auto' will use the fast "
+            "tokenizer if available, and 'slow' will "
+            "always use the slow tokenizer.",
         )
         parser.add_argument(
             "--load-format",
@@ -110,20 +134,20 @@ class ServerArgs:
             "which is mainly for profiling.",
         )
         parser.add_argument(
-            "--tokenizer-mode",
-            type=str,
-            default=ServerArgs.tokenizer_mode,
-            choices=["auto", "slow"],
-            help="Tokenizer mode. 'auto' will use the fast "
-            "tokenizer if available, and 'slow' will "
-            "always use the slow tokenizer.",
-        )
-        parser.add_argument(
-            "--chat-template",
+            "--dtype",
             type=str,
-            default=ServerArgs.chat_template,
-            help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server",
-        )
+            default=ServerArgs.dtype,
+            choices=[
+                "auto", "half", "float16", "bfloat16", "float", "float32"
+            ],
+            help='Data type for model weights and activations.\n\n'
+            '* "auto" will use FP16 precision for FP32 and FP16 models, and '
+            'BF16 precision for BF16 models.\n'
+            '* "half" for FP16. Recommended for AWQ quantization.\n'
+            '* "float16" is the same as "half".\n'
+            '* "bfloat16" for a balance between precision and range.\n'
+            '* "float" is shorthand for FP32 precision.\n'
+            '* "float32" for FP32 precision.')
         parser.add_argument(
             "--trust-remote-code",
             action="store_true",
@@ -135,6 +159,18 @@ class ServerArgs:
             default=ServerArgs.context_length,
             help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
         )
+        parser.add_argument(
+            "--quantization",
+            type=str,
+            default=ServerArgs.quantization,
+            help="The quantization method.",
+        )
+        parser.add_argument(
+            "--chat-template",
+            type=str,
+            default=ServerArgs.chat_template,
+            help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
+        )
         parser.add_argument(
             "--mem-fraction-static",
             type=float,
@@ -142,17 +178,23 @@ class ServerArgs:
             help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
         )
         parser.add_argument(
-            "--max-prefill-num-token",
+            "--max-prefill-tokens",
             type=int,
-            default=ServerArgs.max_prefill_num_token,
+            default=ServerArgs.max_prefill_tokens,
             help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
         )
+        parser.add_argument(
+            "--max-running-requests",
+            type=int,
+            default=ServerArgs.max_running_requests,
+            help="The maximum number of running requests.",
+        )
         parser.add_argument(
             "--schedule-heuristic",
             type=str,
             default=ServerArgs.schedule_heuristic,
             choices=["lpm", "random", "fcfs", "dfs-weight"],
-            help="Scheduling Heuristic.",
+            help="The scheduling heuristic.",
        )
         parser.add_argument(
             "--schedule-conservativeness",
@@ -164,7 +206,7 @@ class ServerArgs:
             "--tp-size",
             type=int,
             default=ServerArgs.tp_size,
-            help="Tensor parallelism size.",
+            help="The tensor parallelism size.",
         )
         parser.add_argument(
             "--stream-interval",
@@ -176,29 +218,24 @@ class ServerArgs:
             "--random-seed",
             type=int,
             default=ServerArgs.random_seed,
-            help="Random seed.",
+            help="The random seed.",
         )
         parser.add_argument(
             "--log-level",
             type=str,
             default=ServerArgs.log_level,
-            help="Logging level",
+            help="The logging level of all loggers.",
         )
         parser.add_argument(
-            "--log-requests",
-            action="store_true",
-            help="Log all requests",
+            "--log-level-http",
+            type=str,
+            default=ServerArgs.log_level_http,
+            help="The logging level of HTTP server. If not set, reuse --log-level by default.",
         )
         parser.add_argument(
-            "--disable-log-stats",
+            "--log-requests",
             action="store_true",
-            help="Disable logging throughput stats.",
-        )
-        parser.add_argument(
-            "--log-stats-interval",
-            type=int,
-            default=ServerArgs.log_stats_interval,
-            help="Log stats interval in second.",
+            help="Log the inputs and outputs of all requests.",
         )
         parser.add_argument(
             "--show-time-cost",
@@ -212,16 +249,47 @@ class ServerArgs:
             help="Set API key of the server",
         )
 
-        # Optimization/debug options
+        # Data parallelism
         parser.add_argument(
-            "--enable-flashinfer",
-            action="store_true",
-            help="Enable flashinfer inference kernels",
+            "--dp-size",
+            type=int,
+            default=ServerArgs.dp_size,
+            help="The data parallelism size.",
         )
         parser.add_argument(
-            "--attention-reduce-in-fp32",
+            "--load-balance-method",
+            type=str,
+            default=ServerArgs.load_balance_method,
+            help="The load balancing strategy for data parallelism.",
+            choices=[
+                "round_robin",
+                "shortest_queue",
+            ],
+        )
+
+        # Multi-node distributed serving args
+        parser.add_argument(
+            "--nccl-init-addr",
+            type=str,
+            help="The nccl init address of multi-node server."
+        )
+        parser.add_argument(
+            "--nnodes",
+            type=int,
+            default=1,
+            help="The number of nodes."
+        )
+        parser.add_argument(
+            "--node-rank",
+            type=int,
+            help="The node rank."
+        )
+
+        # Optimization/debug options
+        parser.add_argument(
+            "--disable-flashinfer",
             action="store_true",
-            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16.",
+            help="Disable flashinfer inference kernels",
         )
         parser.add_argument(
             "--disable-radix-cache",
@@ -238,6 +306,12 @@ class ServerArgs:
             action="store_true",
             help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
         )
+        parser.add_argument(
+            "--attention-reduce-in-fp32",
+            action="store_true",
+            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
+            "This only affects Triton attention kernels",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -249,7 +323,7 @@ class ServerArgs:
 
     def print_mode_args(self):
         return (
-            f"enable_flashinfer={self.enable_flashinfer}, "
+            f"disable_flashinfer={self.disable_flashinfer}, "
            f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}, "
             f"disable_radix_cache={self.disable_radix_cache}, "
             f"disable_regex_jump_forward={self.disable_regex_jump_forward}, "
@@ -257,10 +331,16 @@ class ServerArgs:
         )
 
 
+@dataclasses.dataclass
+class ModelPortArgs:
+    nccl_port: int
+    model_tp_ips: List[str]
+    model_tp_ports: List[int]
+
+
 @dataclasses.dataclass
 class PortArgs:
     tokenizer_port: int
     router_port: int
     detokenizer_port: int
-    nccl_port: int
-    model_rpc_ports: List[int]
+    model_port_args: List[ModelPortArgs]
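The last hunk restructures the port plumbing: the per-model nccl_port and model_rpc_ports move out of PortArgs into a new ModelPortArgs dataclass, and PortArgs now carries a list of them. Below is a minimal sketch of the new shape; the assumption that there is one ModelPortArgs per data-parallel worker follows from the new dp_worker.py and manager_multi.py files, and all port numbers and the dp/tp sizes are illustrative placeholders:

    # Hedged sketch of the restructured port dataclasses from this diff.
    from sglang.srt.server_args import ModelPortArgs, PortArgs

    dp_size, tp_size = 2, 1  # placeholder parallelism sizes
    model_port_args = [
        ModelPortArgs(
            nccl_port=28000 + i,                   # per-worker NCCL port
            model_tp_ips=["127.0.0.1"] * tp_size,  # one IP per TP rank
            model_tp_ports=[28100 + i * tp_size + r for r in range(tp_size)],
        )
        for i in range(dp_size)
    ]

    port_args = PortArgs(
        tokenizer_port=30000,
        router_port=30001,
        detokenizer_port=30002,
        model_port_args=model_port_args,  # replaces nccl_port + model_rpc_ports
    )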