sglang 0.1.14__py3-none-any.whl → 0.1.21__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Files changed (81)
  1. sglang/__init__.py +59 -2
  2. sglang/api.py +40 -11
  3. sglang/backend/anthropic.py +17 -3
  4. sglang/backend/litellm.py +90 -0
  5. sglang/backend/openai.py +160 -12
  6. sglang/backend/runtime_endpoint.py +62 -27
  7. sglang/backend/vertexai.py +1 -0
  8. sglang/bench_latency.py +320 -0
  9. sglang/global_config.py +24 -3
  10. sglang/lang/chat_template.py +122 -6
  11. sglang/lang/compiler.py +2 -2
  12. sglang/lang/interpreter.py +206 -98
  13. sglang/lang/ir.py +98 -34
  14. sglang/lang/tracer.py +6 -4
  15. sglang/launch_server.py +4 -1
  16. sglang/launch_server_llavavid.py +32 -0
  17. sglang/srt/constrained/__init__.py +14 -6
  18. sglang/srt/constrained/fsm_cache.py +9 -2
  19. sglang/srt/constrained/jump_forward.py +113 -24
  20. sglang/srt/conversation.py +4 -2
  21. sglang/srt/flush_cache.py +18 -0
  22. sglang/srt/hf_transformers_utils.py +144 -3
  23. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  24. sglang/srt/layers/extend_attention.py +20 -1
  25. sglang/srt/layers/fused_moe.py +596 -0
  26. sglang/srt/layers/logits_processor.py +190 -61
  27. sglang/srt/layers/radix_attention.py +62 -53
  28. sglang/srt/layers/token_attention.py +21 -9
  29. sglang/srt/managers/controller/cuda_graph_runner.py +196 -0
  30. sglang/srt/managers/controller/dp_worker.py +113 -0
  31. sglang/srt/managers/controller/infer_batch.py +908 -0
  32. sglang/srt/managers/controller/manager_multi.py +195 -0
  33. sglang/srt/managers/controller/manager_single.py +177 -0
  34. sglang/srt/managers/controller/model_runner.py +359 -0
  35. sglang/srt/managers/{router → controller}/radix_cache.py +102 -53
  36. sglang/srt/managers/controller/schedule_heuristic.py +65 -0
  37. sglang/srt/managers/controller/tp_worker.py +813 -0
  38. sglang/srt/managers/detokenizer_manager.py +42 -40
  39. sglang/srt/managers/io_struct.py +44 -10
  40. sglang/srt/managers/tokenizer_manager.py +224 -82
  41. sglang/srt/memory_pool.py +52 -59
  42. sglang/srt/model_config.py +97 -2
  43. sglang/srt/models/chatglm.py +399 -0
  44. sglang/srt/models/commandr.py +369 -0
  45. sglang/srt/models/dbrx.py +406 -0
  46. sglang/srt/models/gemma.py +34 -38
  47. sglang/srt/models/gemma2.py +436 -0
  48. sglang/srt/models/grok.py +738 -0
  49. sglang/srt/models/llama2.py +47 -37
  50. sglang/srt/models/llama_classification.py +107 -0
  51. sglang/srt/models/llava.py +92 -27
  52. sglang/srt/models/llavavid.py +298 -0
  53. sglang/srt/models/minicpm.py +366 -0
  54. sglang/srt/models/mixtral.py +302 -127
  55. sglang/srt/models/mixtral_quant.py +372 -0
  56. sglang/srt/models/qwen.py +40 -35
  57. sglang/srt/models/qwen2.py +33 -36
  58. sglang/srt/models/qwen2_moe.py +473 -0
  59. sglang/srt/models/stablelm.py +33 -39
  60. sglang/srt/models/yivl.py +19 -26
  61. sglang/srt/openai_api_adapter.py +411 -0
  62. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +44 -19
  63. sglang/srt/sampling_params.py +2 -0
  64. sglang/srt/server.py +197 -481
  65. sglang/srt/server_args.py +190 -74
  66. sglang/srt/utils.py +460 -95
  67. sglang/test/test_programs.py +73 -10
  68. sglang/test/test_utils.py +226 -7
  69. sglang/utils.py +97 -27
  70. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/METADATA +74 -45
  71. sglang-0.1.21.dist-info/RECORD +82 -0
  72. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/WHEEL +1 -1
  73. sglang/srt/backend_config.py +0 -13
  74. sglang/srt/managers/router/infer_batch.py +0 -503
  75. sglang/srt/managers/router/manager.py +0 -79
  76. sglang/srt/managers/router/model_rpc.py +0 -686
  77. sglang/srt/managers/router/model_runner.py +0 -514
  78. sglang/srt/managers/router/scheduler.py +0 -70
  79. sglang-0.1.14.dist-info/RECORD +0 -64
  80. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/LICENSE +0 -0
  81. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -1,56 +1,90 @@
+"""The arguments of the server."""
+
 import argparse
 import dataclasses
+import random
 from typing import List, Optional, Union
 
 
 @dataclasses.dataclass
 class ServerArgs:
+    # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
+    tokenizer_mode: str = "auto"
+    load_format: str = "auto"
+    dtype: str = "auto"
+    trust_remote_code: bool = True
+    context_length: Optional[int] = None
+    quantization: Optional[str] = None
+    chat_template: Optional[str] = None
+
+    # Port
     host: str = "127.0.0.1"
     port: int = 30000
     additional_ports: Optional[Union[List[int], int]] = None
-    load_format: str = "auto"
-    tokenizer_mode: str = "auto"
-    chat_template: Optional[str] = None
-    trust_remote_code: bool = True
+
+    # Memory and scheduling
     mem_fraction_static: Optional[float] = None
-    max_prefill_num_token: Optional[int] = None
-    context_length: Optional[int] = None
-    tp_size: int = 1
+    max_prefill_tokens: Optional[int] = None
+    max_running_requests: Optional[int] = None
     schedule_heuristic: str = "lpm"
-    schedule_conservativeness: float = 1.0
-    attention_reduce_in_fp32: bool = False
-    random_seed: int = 42
+    schedule_conservativeness: float = 0.8
+
+    # Other runtime options
+    tp_size: int = 1
     stream_interval: int = 8
-    disable_log_stats: bool = False
-    log_stats_interval: int = 10
+    random_seed: Optional[int] = None
+
+    # Logging
     log_level: str = "info"
+    log_level_http: Optional[str] = None
+    log_requests: bool = False
+    show_time_cost: bool = False
+
+    # Other
+    api_key: str = ""
 
-    # optional modes
+    # Data parallelism
+    dp_size: int = 1
+    load_balance_method: str = "round_robin"
+
+    # Optimization/debug options
+    disable_flashinfer: bool = False
     disable_radix_cache: bool = False
-    enable_flashinfer: bool = False
     disable_regex_jump_forward: bool = False
+    disable_cuda_graph: bool = False
     disable_disk_cache: bool = False
-    api_key: str = ""
+    attention_reduce_in_fp32: bool = False
+    enable_p2p_check: bool = False
+
+    # Distributed args
+    nccl_init_addr: Optional[str] = None
+    nnodes: int = 1
+    node_rank: Optional[int] = None
 
     def __post_init__(self):
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
         if self.mem_fraction_static is None:
-            if self.tp_size >= 8:
-                self.mem_fraction_static = 0.80
+            if self.tp_size >= 16:
+                self.mem_fraction_static = 0.74
+            elif self.tp_size >= 8:
+                self.mem_fraction_static = 0.78
             elif self.tp_size >= 4:
                 self.mem_fraction_static = 0.82
             elif self.tp_size >= 2:
                 self.mem_fraction_static = 0.85
             else:
-                self.mem_fraction_static = 0.90
+                self.mem_fraction_static = 0.88
         if isinstance(self.additional_ports, int):
             self.additional_ports = [self.additional_ports]
         elif self.additional_ports is None:
             self.additional_ports = []
 
+        if self.random_seed is None:
+            self.random_seed = random.randint(0, 1 << 30)
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         parser.add_argument(
@@ -65,15 +99,27 @@ class ServerArgs:
             default=ServerArgs.tokenizer_path,
             help="The path of the tokenizer.",
         )
-        parser.add_argument("--host", type=str, default=ServerArgs.host)
-        parser.add_argument("--port", type=int, default=ServerArgs.port)
-        # we want to be able to pass a list of ports
+        parser.add_argument(
+            "--host", type=str, default=ServerArgs.host, help="The host of the server."
+        )
+        parser.add_argument(
+            "--port", type=int, default=ServerArgs.port, help="The port of the server."
+        )
         parser.add_argument(
             "--additional-ports",
             type=int,
             nargs="*",
             default=[],
-            help="Additional ports specified for launching server.",
+            help="The additional ports specified for the server.",
+        )
+        parser.add_argument(
+            "--tokenizer-mode",
+            type=str,
+            default=ServerArgs.tokenizer_mode,
+            choices=["auto", "slow"],
+            help="Tokenizer mode. 'auto' will use the fast "
+            "tokenizer if available, and 'slow' will "
+            "always use the slow tokenizer.",
         )
         parser.add_argument(
             "--load-format",
@@ -92,25 +138,42 @@ class ServerArgs:
             "which is mainly for profiling.",
         )
         parser.add_argument(
-            "--tokenizer-mode",
+            "--dtype",
             type=str,
-            default=ServerArgs.tokenizer_mode,
-            choices=["auto", "slow"],
-            help="Tokenizer mode. 'auto' will use the fast "
-            "tokenizer if available, and 'slow' will "
-            "always use the slow tokenizer.",
-        )
-        parser.add_argument(
-            "--chat-template",
-            type=str,
-            default=ServerArgs.chat_template,
-            help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server",
+            default=ServerArgs.dtype,
+            choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
+            help="Data type for model weights and activations.\n\n"
+            '* "auto" will use FP16 precision for FP32 and FP16 models, and '
+            "BF16 precision for BF16 models.\n"
+            '* "half" for FP16. Recommended for AWQ quantization.\n'
+            '* "float16" is the same as "half".\n'
+            '* "bfloat16" for a balance between precision and range.\n'
+            '* "float" is shorthand for FP32 precision.\n'
+            '* "float32" for FP32 precision.',
         )
         parser.add_argument(
             "--trust-remote-code",
             action="store_true",
             help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
         )
+        parser.add_argument(
+            "--context-length",
+            type=int,
+            default=ServerArgs.context_length,
+            help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
+        )
+        parser.add_argument(
+            "--quantization",
+            type=str,
+            default=ServerArgs.quantization,
+            help="The quantization method.",
+        )
+        parser.add_argument(
+            "--chat-template",
+            type=str,
+            default=ServerArgs.chat_template,
+            help="The builtin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
+        )
         parser.add_argument(
             "--mem-fraction-static",
             type=float,
@@ -118,28 +181,23 @@ class ServerArgs:
             help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
         )
         parser.add_argument(
-            "--max-prefill-num-token",
+            "--max-prefill-tokens",
             type=int,
-            default=ServerArgs.max_prefill_num_token,
+            default=ServerArgs.max_prefill_tokens,
             help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
         )
         parser.add_argument(
-            "--context-length",
-            type=int,
-            default=ServerArgs.context_length,
-            help="The model's maximum context length. Use this to reduce the context length to save memory. Defaults to None (will use the value from the model's config.json instead).",
-        )
-        parser.add_argument(
-            "--tp-size",
+            "--max-running-requests",
             type=int,
-            default=ServerArgs.tp_size,
-            help="Tensor parallelism degree.",
+            default=ServerArgs.max_running_requests,
+            help="The maximum number of running requests.",
         )
         parser.add_argument(
             "--schedule-heuristic",
             type=str,
             default=ServerArgs.schedule_heuristic,
-            help="Schudule mode: [lpm, weight, random, fcfs]",
+            choices=["lpm", "random", "fcfs", "dfs-weight"],
+            help="The scheduling heuristic.",
         )
         parser.add_argument(
             "--schedule-conservativeness",
@@ -148,15 +206,10 @@ class ServerArgs:
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )
         parser.add_argument(
-            "--random-seed",
+            "--tp-size",
             type=int,
-            default=ServerArgs.random_seed,
-            help="Random seed.",
-        )
-        parser.add_argument(
-            "--attention-reduce-in-fp32",
-            action="store_true",
-            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16.",
+            default=ServerArgs.tp_size,
+            help="The tensor parallelism size.",
         )
         parser.add_argument(
             "--stream-interval",
@@ -164,49 +217,106 @@ class ServerArgs:
             default=ServerArgs.stream_interval,
             help="The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher",
         )
+        parser.add_argument(
+            "--random-seed",
+            type=int,
+            default=ServerArgs.random_seed,
+            help="The random seed.",
+        )
         parser.add_argument(
             "--log-level",
             type=str,
             default=ServerArgs.log_level,
-            help="Log level",
+            help="The logging level of all loggers.",
         )
         parser.add_argument(
-            "--disable-log-stats",
+            "--log-level-http",
+            type=str,
+            default=ServerArgs.log_level_http,
+            help="The logging level of HTTP server. If not set, reuse --log-level by default.",
+        )
+        parser.add_argument(
+            "--log-requests",
             action="store_true",
-            help="Disable logging throughput stats.",
+            help="Log the inputs and outputs of all requests.",
+        )
+        parser.add_argument(
+            "--show-time-cost",
+            action="store_true",
+            help="Show time cost of custom marks",
+        )
+        parser.add_argument(
+            "--api-key",
+            type=str,
+            default=ServerArgs.api_key,
+            help="Set API key of the server",
         )
+
+        # Data parallelism
         parser.add_argument(
-            "--log-stats-interval",
+            "--dp-size",
             type=int,
-            default=ServerArgs.log_stats_interval,
-            help="Log stats interval in second.",
+            default=ServerArgs.dp_size,
+            help="The data parallelism size.",
         )
-        # optional modes
         parser.add_argument(
-            "--disable-radix-cache",
+            "--load-balance-method",
+            type=str,
+            default=ServerArgs.load_balance_method,
+            help="The load balancing strategy for data parallelism.",
+            choices=[
+                "round_robin",
+                "shortest_queue",
+            ],
+        )
+
+        # Multi-node distributed serving args
+        parser.add_argument(
+            "--nccl-init-addr",
+            type=str,
+            help="The nccl init address of multi-node server.",
+        )
+        parser.add_argument(
+            "--nnodes", type=int, default=1, help="The number of nodes."
+        )
+        parser.add_argument("--node-rank", type=int, help="The node rank.")
+
+        # Optimization/debug options
+        parser.add_argument(
+            "--disable-flashinfer",
             action="store_true",
-            help="Disable RadixAttention",
+            help="Disable flashinfer inference kernels",
         )
         parser.add_argument(
-            "--enable-flashinfer",
+            "--disable-radix-cache",
             action="store_true",
-            help="Enable flashinfer inference kernels",
+            help="Disable RadixAttention",
         )
         parser.add_argument(
             "--disable-regex-jump-forward",
             action="store_true",
             help="Disable regex jump-forward",
         )
+        parser.add_argument(
+            "--disable-cuda-graph",
+            action="store_true",
+            help="Disable cuda graph.",
+        )
         parser.add_argument(
             "--disable-disk-cache",
             action="store_true",
             help="Disable disk cache to avoid possible crashes related to file system or high concurrency.",
         )
         parser.add_argument(
-            "--api-key",
-            type=str,
-            default=ServerArgs.api_key,
-            help="Set API Key",
+            "--attention-reduce-in-fp32",
+            action="store_true",
+            help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16. "
+            "This only affects Triton attention kernels",
+        )
+        parser.add_argument(
+            "--enable-p2p-check",
+            action="store_true",
+            help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
         )
 
     @classmethod
@@ -217,20 +327,26 @@ class ServerArgs:
     def url(self):
         return f"http://{self.host}:{self.port}"
 
-    def get_optional_modes_logging(self):
+    def print_mode_args(self):
         return (
+            f"disable_flashinfer={self.disable_flashinfer}, "
+            f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}, "
             f"disable_radix_cache={self.disable_radix_cache}, "
-            f"enable_flashinfer={self.enable_flashinfer}, "
             f"disable_regex_jump_forward={self.disable_regex_jump_forward}, "
             f"disable_disk_cache={self.disable_disk_cache}, "
-            f"attention_reduce_in_fp32={self.attention_reduce_in_fp32}"
         )
 
 
+@dataclasses.dataclass
+class ModelPortArgs:
+    nccl_port: int
+    model_tp_ips: List[str]
+    model_tp_ports: List[int]
+
+
 @dataclasses.dataclass
 class PortArgs:
     tokenizer_port: int
     router_port: int
     detokenizer_port: int
-    nccl_port: int
-    model_rpc_ports: List[int]
+    model_port_args: List[ModelPortArgs]
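
For scripts that construct the server configuration directly, here is a minimal sketch of how the reworked ServerArgs behaves in 0.1.21. The model path is purely illustrative, and the --model-path argument and the from_cli_args classmethod are assumptions inferred from the collapsed context above (their definitions are elided in this diff):

import argparse

from sglang.srt.server_args import ServerArgs

# Programmatic construction: __post_init__ fills in the derived
# defaults shown in the first hunk above.
args = ServerArgs(model_path="meta-llama/Llama-2-7b-hf", tp_size=4)
assert args.tokenizer_path == args.model_path  # defaults to the model path
assert args.mem_fraction_static == 0.82        # the tp_size >= 4 bucket
assert args.random_seed is not None            # randomized when unset; no longer a fixed 42

# CLI construction with the renamed and inverted flags.
parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
cli = parser.parse_args(
    [
        "--model-path", "meta-llama/Llama-2-7b-hf",
        "--max-prefill-tokens", "8192",  # was --max-prefill-num-token in 0.1.14
        "--disable-flashinfer",          # replaces --enable-flashinfer
    ]
)
server_args = ServerArgs.from_cli_args(cli)  # assumed classmethod; its body is collapsed above

Note that the optimization flags flipped polarity: flashinfer kernels and CUDA graphs are on by default in 0.1.21, and --disable-flashinfer / --disable-cuda-graph turn them off, replacing the old opt-in --enable-flashinfer.

The port plumbing also changed shape: PortArgs no longer carries a flat nccl_port and model_rpc_ports, and instead holds one ModelPortArgs per tensor-parallel worker group. A sketch with placeholder port numbers:

from sglang.srt.server_args import ModelPortArgs, PortArgs

# One ModelPortArgs per model worker group; all ports below are illustrative.
port_args = PortArgs(
    tokenizer_port=30001,
    router_port=30002,
    detokenizer_port=30003,
    model_port_args=[
        ModelPortArgs(
            nccl_port=30004,
            model_tp_ips=["127.0.0.1"],
            model_tp_ports=[30005],
        )
    ],
)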