sglang 0.2.5-py3-none-any.whl → 0.2.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. sglang/__init__.py +33 -26
  2. sglang/api.py +9 -1
  3. sglang/bench_latency.py +2 -2
  4. sglang/bench_serving.py +10 -1
  5. sglang/check_env.py +1 -1
  6. sglang/lang/backend/litellm.py +1 -1
  7. sglang/lang/backend/openai.py +1 -1
  8. sglang/lang/backend/runtime_endpoint.py +4 -4
  9. sglang/lang/interpreter.py +24 -9
  10. sglang/lang/ir.py +1 -1
  11. sglang/srt/constrained/__init__.py +15 -0
  12. sglang/srt/constrained/base_cache.py +15 -0
  13. sglang/srt/constrained/fsm_cache.py +36 -1
  14. sglang/srt/constrained/jump_forward.py +15 -0
  15. sglang/srt/conversation.py +26 -0
  16. sglang/srt/hf_transformers_utils.py +18 -1
  17. sglang/srt/layers/context_flashattention_nopad.py +15 -0
  18. sglang/srt/layers/extend_attention.py +15 -0
  19. sglang/srt/layers/fused_moe.py +15 -0
  20. sglang/srt/layers/linear.py +15 -0
  21. sglang/srt/layers/logits_processor.py +109 -72
  22. sglang/srt/layers/quantization/__init__.py +15 -0
  23. sglang/srt/layers/quantization/fp8.py +15 -0
  24. sglang/srt/layers/radix_attention.py +21 -3
  25. sglang/srt/layers/token_attention.py +16 -1
  26. sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
  27. sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
  28. sglang/srt/managers/detokenizer_manager.py +16 -1
  29. sglang/srt/managers/io_struct.py +38 -5
  30. sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
  31. sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +85 -25
  32. sglang/srt/managers/tokenizer_manager.py +99 -57
  33. sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +177 -81
  34. sglang/srt/mem_cache/flush_cache.py +33 -0
  35. sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
  36. sglang/srt/{managers/controller → mem_cache}/radix_cache.py +15 -0
  37. sglang/srt/mm_utils.py +15 -0
  38. sglang/srt/model_config.py +20 -0
  39. sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +42 -18
  40. sglang/srt/{managers/controller → model_executor}/model_runner.py +51 -16
  41. sglang/srt/model_loader/model_loader.py +15 -0
  42. sglang/srt/model_loader/utils.py +16 -1
  43. sglang/srt/models/chatglm.py +16 -1
  44. sglang/srt/models/commandr.py +16 -1
  45. sglang/srt/models/dbrx.py +16 -1
  46. sglang/srt/models/deepseek.py +16 -1
  47. sglang/srt/models/deepseek_v2.py +532 -0
  48. sglang/srt/models/gemma.py +16 -1
  49. sglang/srt/models/gemma2.py +16 -1
  50. sglang/srt/models/gpt_bigcode.py +16 -1
  51. sglang/srt/models/grok.py +16 -1
  52. sglang/srt/models/internlm2.py +16 -1
  53. sglang/srt/models/llama2.py +16 -1
  54. sglang/srt/models/llama_classification.py +19 -4
  55. sglang/srt/models/llava.py +17 -2
  56. sglang/srt/models/llavavid.py +17 -2
  57. sglang/srt/models/minicpm.py +16 -1
  58. sglang/srt/models/mistral.py +15 -0
  59. sglang/srt/models/mixtral.py +16 -1
  60. sglang/srt/models/mixtral_quant.py +16 -1
  61. sglang/srt/models/qwen.py +16 -1
  62. sglang/srt/models/qwen2.py +16 -1
  63. sglang/srt/models/qwen2_moe.py +16 -1
  64. sglang/srt/models/stablelm.py +16 -1
  65. sglang/srt/models/yivl.py +15 -0
  66. sglang/srt/openai_api/adapter.py +545 -160
  67. sglang/srt/openai_api/protocol.py +65 -1
  68. sglang/srt/sampling_params.py +20 -4
  69. sglang/srt/server.py +90 -37
  70. sglang/srt/server_args.py +76 -17
  71. sglang/srt/utils.py +15 -0
  72. sglang/test/test_programs.py +5 -1
  73. sglang/utils.py +22 -0
  74. sglang/version.py +1 -1
  75. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/METADATA +40 -12
  76. sglang-0.2.7.dist-info/RECORD +93 -0
  77. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/WHEEL +1 -1
  78. sglang/srt/flush_cache.py +0 -18
  79. sglang-0.2.5.dist-info/RECORD +0 -92
  80. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/LICENSE +0 -0
  81. {sglang-0.2.5.dist-info → sglang-0.2.7.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/protocol.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """Pydantic models for OpenAI API protocol"""

  import time
@@ -45,6 +60,55 @@ class UsageInfo(BaseModel):
      completion_tokens: Optional[int] = 0


+ class FileRequest(BaseModel):
+     # https://platform.openai.com/docs/api-reference/files/create
+     file: bytes  # The File object (not file name) to be uploaded
+     purpose: str = (
+         "batch"  # The intended purpose of the uploaded file, default is "batch"
+     )
+
+
+ class FileResponse(BaseModel):
+     id: str
+     object: str = "file"
+     bytes: int
+     created_at: int
+     filename: str
+     purpose: str
+
+
+ class BatchRequest(BaseModel):
+     input_file_id: (
+         str  # The ID of an uploaded file that contains requests for the new batch
+     )
+     endpoint: str  # The endpoint to be used for all requests in the batch
+     completion_window: str  # The time frame within which the batch should be processed
+     metadata: Optional[dict] = None  # Optional custom metadata for the batch
+
+
+ class BatchResponse(BaseModel):
+     id: str
+     object: str = "batch"
+     endpoint: str
+     errors: Optional[dict] = None
+     input_file_id: str
+     completion_window: str
+     status: str = "validating"
+     output_file_id: Optional[str] = None
+     error_file_id: Optional[str] = None
+     created_at: int
+     in_progress_at: Optional[int] = None
+     expires_at: Optional[int] = None
+     finalizing_at: Optional[int] = None
+     completed_at: Optional[int] = None
+     failed_at: Optional[int] = None
+     expired_at: Optional[int] = None
+     cancelling_at: Optional[int] = None
+     cancelled_at: Optional[int] = None
+     request_counts: dict = {"total": 0, "completed": 0, "failed": 0}
+     metadata: Optional[dict] = None
+
+
  class CompletionRequest(BaseModel):
      # Ordered by official OpenAI API documentation
      # https://platform.openai.com/docs/api-reference/completions/create
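The new Pydantic models mirror OpenAI's file and batch objects. A minimal client-side sketch of how they fit together (the ids, timestamps, and filename below are illustrative values, not anything taken from the package):

import time

from sglang.srt.openai_api.protocol import BatchRequest, BatchResponse, FileResponse

# A FileResponse is what the server returns after an upload; ids are server-assigned.
uploaded = FileResponse(
    id="file-abc123",
    bytes=2048,
    created_at=int(time.time()),
    filename="batch_input.jsonl",
    purpose="batch",
)

# A BatchRequest points a batch at the uploaded file and a target endpoint.
batch_req = BatchRequest(
    input_file_id=uploaded.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)

# The corresponding BatchResponse starts in the "validating" state.
batch = BatchResponse(
    id="batch-xyz789",
    endpoint=batch_req.endpoint,
    input_file_id=batch_req.input_file_id,
    completion_window=batch_req.completion_window,
    created_at=int(time.time()),
)
print(batch.status, batch.request_counts)  # validating {'total': 0, 'completed': 0, 'failed': 0}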
@@ -152,7 +216,7 @@ class ChatCompletionRequest(BaseModel):
      logit_bias: Optional[Dict[str, float]] = None
      logprobs: Optional[bool] = False
      top_logprobs: Optional[int] = None
-     max_tokens: Optional[int] = 16
+     max_tokens: Optional[int] = None
      n: Optional[int] = 1
      presence_penalty: Optional[float] = 0.0
      response_format: Optional[ResponseFormat] = None
sglang/srt/sampling_params.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """Sampling parameters for text generation."""

  from typing import List, Optional, Union
@@ -65,10 +80,11 @@ class SamplingParams:
              raise ValueError(
                  "presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}."
              )
-         if self.max_new_tokens < 0:
-             raise ValueError(
-                 f"max_new_tokens must be at least 0, got {self.max_new_tokens}."
-             )
+         if self.max_new_tokens is not None:
+             if self.max_new_tokens < 0:
+                 raise ValueError(
+                     f"max_new_tokens must be at least 0, got {self.max_new_tokens}."
+                 )

      def normalize(self, tokenizer):
          # Process stop strings
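This pairs with the max_tokens default change above: an unset limit now reaches SamplingParams as None instead of tripping the "< 0" comparison. A tiny sketch, assuming the validation runs in SamplingParams.verify() (the enclosing method name is not visible in this hunk):

from sglang.srt.sampling_params import SamplingParams

params = SamplingParams(max_new_tokens=None)
params.verify()  # the new None guard skips the "< 0" check instead of raising a TypeError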
sglang/srt/server.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """
  The entry point of inference server.
  SRT = SGLang Runtime.
@@ -23,17 +38,17 @@ import psutil
  import requests
  import uvicorn
  import uvloop
- from fastapi import FastAPI, Request
+ from fastapi import FastAPI, File, Form, Request, UploadFile
  from fastapi.responses import JSONResponse, Response, StreamingResponse

  from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.srt.constrained import disable_cache
  from sglang.srt.hf_transformers_utils import get_tokenizer
- from sglang.srt.managers.controller.manager_multi import (
+ from sglang.srt.managers.controller_multi import (
      start_controller_process as start_controller_process_multi,
  )
- from sglang.srt.managers.controller.manager_single import launch_tp_servers
- from sglang.srt.managers.controller.manager_single import (
+ from sglang.srt.managers.controller_single import launch_tp_servers
+ from sglang.srt.managers.controller_single import (
      start_controller_process as start_controller_process_single,
  )
  from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
@@ -41,8 +56,13 @@ from sglang.srt.managers.io_struct import GenerateReqInput
  from sglang.srt.managers.tokenizer_manager import TokenizerManager
  from sglang.srt.openai_api.adapter import (
      load_chat_template_for_openai_api,
+     v1_batches,
      v1_chat_completions,
      v1_completions,
+     v1_files_create,
+     v1_retrieve_batch,
+     v1_retrieve_file,
+     v1_retrieve_file_content,
  )
  from sglang.srt.openai_api.protocol import ModelCard, ModelList
  from sglang.srt.server_args import PortArgs, ServerArgs
@@ -65,9 +85,6 @@ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
  app = FastAPI()
  tokenizer_manager = None

- # Put some args for easily access
- global_server_args_dict = {}
-

  @app.get("/health")
  async def health() -> Response:
@@ -140,6 +157,35 @@ async def openai_v1_chat_completions(raw_request: Request):
      return await v1_chat_completions(tokenizer_manager, raw_request)


+ @app.post("/v1/files")
+ async def openai_v1_files(file: UploadFile = File(...), purpose: str = Form("batch")):
+     return await v1_files_create(
+         file, purpose, tokenizer_manager.server_args.file_storage_pth
+     )
+
+
+ @app.post("/v1/batches")
+ async def openai_v1_batches(raw_request: Request):
+     return await v1_batches(tokenizer_manager, raw_request)
+
+
+ @app.get("/v1/batches/{batch_id}")
+ async def retrieve_batch(batch_id: str):
+     return await v1_retrieve_batch(batch_id)
+
+
+ @app.get("/v1/files/{file_id}")
+ async def retrieve_file(file_id: str):
+     # https://platform.openai.com/docs/api-reference/files/retrieve
+     return await v1_retrieve_file(file_id)
+
+
+ @app.get("/v1/files/{file_id}/content")
+ async def retrieve_file_content(file_id: str):
+     # https://platform.openai.com/docs/api-reference/files/retrieve-contents
+     return await v1_retrieve_file_content(file_id)
+
+
  @app.get("/v1/models")
  def available_models():
      """Show available models."""
@@ -150,14 +196,6 @@ def available_models():
      return ModelList(data=model_cards)


- def _set_global_server_args(server_args: ServerArgs):
-     global global_server_args_dict
-     global_server_args_dict = {
-         "disable_flashinfer": server_args.disable_flashinfer,
-         "attention_reduce_in_fp32": server_args.attention_reduce_in_fp32,
-     }
-
-
  def _set_torch_compile_config():
      # The following configurations are for torch compile optimizations
      import torch._dynamo.config
@@ -171,11 +209,46 @@ def _set_torch_compile_config():
      torch._dynamo.config.accumulated_cache_size_limit = 256


+ def set_envs_and_config(server_args: ServerArgs):
+     # Set global environments
+     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+     os.environ["NCCL_CUMEM_ENABLE"] = "0"
+     os.environ["NCCL_NVLS_ENABLE"] = "0"
+     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+     # Set ulimit
+     set_ulimit()
+
+     # Enable show time cost for debugging
+     if server_args.show_time_cost:
+         enable_show_time_cost()
+
+     # Disable disk cache
+     if server_args.disable_disk_cache:
+         disable_cache()
+
+     # Fix triton bugs
+     if server_args.tp_size * server_args.dp_size > 1:
+         # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
+         maybe_set_triton_cache_manager()
+
+     # Set torch compile config
+     if server_args.enable_torch_compile:
+         _set_torch_compile_config()
+
+     # Set global chat template
+     if server_args.chat_template:
+         # TODO: replace this with huggingface transformers template
+         load_chat_template_for_openai_api(server_args.chat_template)
+
+
  def launch_server(
      server_args: ServerArgs,
      model_overide_args: Optional[dict] = None,
      pipe_finish_writer: Optional[mp.connection.Connection] = None,
  ):
+     server_args.check_server_args()
+
      """Launch an HTTP server."""
      global tokenizer_manager
@@ -184,34 +257,16 @@
          format="%(message)s",
      )

-     # Set global environments
-     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-     os.environ["NCCL_CUMEM_ENABLE"] = "0"
-     os.environ["NCCL_NVLS_ENABLE"] = "0"
-     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
-     set_ulimit()
-     if server_args.show_time_cost:
-         enable_show_time_cost()
-     if server_args.disable_disk_cache:
-         disable_cache()
      if not server_args.disable_flashinfer:
          assert_pkg_version(
              "flashinfer",
-             "0.1.1",
+             "0.1.2",
              "Please uninstall the old version and "
              "reinstall the latest version by following the instructions "
              "at https://docs.flashinfer.ai/installation.html.",
          )
-     if server_args.tp_size * server_args.dp_size > 1:
-         # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
-         maybe_set_triton_cache_manager()
-     if server_args.chat_template:
-         # TODO: replace this with huggingface transformers template
-         load_chat_template_for_openai_api(server_args.chat_template)
-     if server_args.enable_torch_compile:
-         _set_torch_compile_config()

-     _set_global_server_args(server_args)
+     set_envs_and_config(server_args)

      # Allocate ports
      server_args.port, server_args.additional_ports = allocate_init_ports(
@@ -230,8 +285,6 @@
 
      # Handle multi-node tensor parallelism
      if server_args.nnodes > 1:
-         assert server_args.dp_size == 1, "Multi-node dp is not supported."
-
          if server_args.node_rank != 0:
              tp_size_local = server_args.tp_size // server_args.nnodes
              gpu_ids = [
sglang/srt/server_args.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """The arguments of the server."""

  import argparse
@@ -28,7 +43,8 @@ class ServerArgs:
      mem_fraction_static: Optional[float] = None
      max_prefill_tokens: Optional[int] = None
      max_running_requests: Optional[int] = None
-     schedule_heuristic: str = "lpm"
+     max_num_reqs: Optional[int] = None
+     schedule_policy: str = "lpm"
      schedule_conservativeness: float = 1.0

      # Other runtime options
@@ -44,20 +60,25 @@ class ServerArgs:

      # Other
      api_key: str = ""
+     file_storage_pth: str = "SGlang_storage"

      # Data parallelism
      dp_size: int = 1
      load_balance_method: str = "round_robin"

+     # Chunked Prefill
+     chunked_prefill_size: Optional[int] = None
+
      # Optimization/debug options
      disable_flashinfer: bool = False
+     disable_flashinfer_sampling: bool = False
      disable_radix_cache: bool = False
      disable_regex_jump_forward: bool = False
      disable_cuda_graph: bool = False
      disable_disk_cache: bool = False
      enable_torch_compile: bool = False
-     attention_reduce_in_fp32: bool = False
      enable_p2p_check: bool = False
+     attention_reduce_in_fp32: bool = False
      efficient_weight_load: bool = False

      # Distributed args
@@ -70,15 +91,15 @@ class ServerArgs:
              self.tokenizer_path = self.model_path
          if self.mem_fraction_static is None:
              if self.tp_size >= 16:
-                 self.mem_fraction_static = 0.80
+                 self.mem_fraction_static = 0.79
              elif self.tp_size >= 8:
-                 self.mem_fraction_static = 0.84
+                 self.mem_fraction_static = 0.83
              elif self.tp_size >= 4:
-                 self.mem_fraction_static = 0.86
+                 self.mem_fraction_static = 0.85
              elif self.tp_size >= 2:
-                 self.mem_fraction_static = 0.88
+                 self.mem_fraction_static = 0.87
              else:
-                 self.mem_fraction_static = 0.89
+                 self.mem_fraction_static = 0.88
          if isinstance(self.additional_ports, int):
              self.additional_ports = [self.additional_ports]
          elif self.additional_ports is None:
@@ -174,6 +195,7 @@ class ServerArgs:
              "gptq",
              "marlin",
              "gptq_marlin",
+             "awq_marlin",
              "squeezellm",
              "bitsandbytes",
          ],
@@ -204,11 +226,17 @@ class ServerArgs:
              help="The maximum number of running requests.",
          )
          parser.add_argument(
-             "--schedule-heuristic",
+             "--max-num-reqs",
+             type=int,
+             default=ServerArgs.max_num_reqs,
+             help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
+         )
+         parser.add_argument(
+             "--schedule-policy",
              type=str,
-             default=ServerArgs.schedule_heuristic,
+             default=ServerArgs.schedule_policy,
              choices=["lpm", "random", "fcfs", "dfs-weight"],
-             help="The scheduling heuristic.",
+             help="The scheduling policy of the requests.",
          )
          parser.add_argument(
              "--schedule-conservativeness",
@@ -262,6 +290,12 @@ class ServerArgs:
              default=ServerArgs.api_key,
              help="Set API key of the server.",
          )
+         parser.add_argument(
+             "--file-storage-pth",
+             type=str,
+             default=ServerArgs.file_storage_pth,
+             help="The path of the file storage in backend.",
+         )

          # Data parallelism
          parser.add_argument(
@@ -288,15 +322,28 @@ class ServerArgs:
              help="The nccl init address of multi-node server.",
          )
          parser.add_argument(
-             "--nnodes", type=int, default=1, help="The number of nodes."
+             "--nnodes", type=int, default=ServerArgs.nnodes, help="The number of nodes."
          )
          parser.add_argument("--node-rank", type=int, help="The node rank.")

+         # Chunked prefill
+         parser.add_argument(
+             "--chunked-prefill-size",
+             type=int,
+             default=ServerArgs.chunked_prefill_size,
+             help="The size of the chunked prefill.",
+         )
+
          # Optimization/debug options
          parser.add_argument(
              "--disable-flashinfer",
              action="store_true",
-             help="Disable flashinfer inference kernels.",
+             help="Disable flashinfer attention kernels.",
+         )
+         parser.add_argument(
+             "--disable-flashinfer-sampling",
+             action="store_true",
+             help="Disable flashinfer sampling kernels.",
          )
          parser.add_argument(
              "--disable-radix-cache",
@@ -324,15 +371,15 @@ class ServerArgs:
              help="Optimize the model with torch.compile, experimental feature.",
          )
          parser.add_argument(
-             "--attention-reduce-in-fp32",
+             "--enable-p2p-check",
              action="store_true",
-             help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
-             "This only affects Triton attention kernels",
+             help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
          )
          parser.add_argument(
-             "--enable-p2p-check",
+             "--attention-reduce-in-fp32",
              action="store_true",
-             help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
+             help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
+             "This only affects Triton attention kernels",
          )
          parser.add_argument(
              "--efficient-weight-load",
@@ -357,6 +404,18 @@ class ServerArgs:
              f"disable_disk_cache={self.disable_disk_cache}, "
          )

+     def check_server_args(self):
+         assert (
+             self.tp_size % self.nnodes == 0
+         ), "tp_size must be divisible by number of nodes"
+         assert not (
+             self.dp_size > 1 and self.node_rank is not None
+         ), "multi-node data parallel is not supported"
+
+         assert not (
+             self.chunked_prefill_size is not None and self.disable_radix_cache
+         ), "chunked prefill is not supported with radix cache disabled currently"
+

  @dataclasses.dataclass
  class PortArgs:
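check_server_args() is called at the top of launch_server (see the server.py hunk above), so invalid combinations now fail before any processes start. A quick sketch of one rejected configuration; the model path is a placeholder:

from sglang.srt.server_args import ServerArgs

# Chunked prefill currently requires the radix cache to stay enabled.
args = ServerArgs(
    model_path="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model
    chunked_prefill_size=4096,
    disable_radix_cache=True,
)
try:
    args.check_server_args()
except AssertionError as err:
    print(err)  # chunked prefill is not supported with radix cache disabled currently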
sglang/srt/utils.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """Common utilities."""

  import base64
sglang/test/test_programs.py CHANGED
@@ -118,7 +118,11 @@ def test_decode_json_regex():
      s += "}"

      ret = decode_json.run()
-     js_obj = json.loads(ret["json_output"])
+     try:
+         js_obj = json.loads(ret["json_output"])
+     except json.decoder.JSONDecodeError:
+         print(ret["json_output"])
+         raise
      assert isinstance(js_obj["name"], str)
      assert isinstance(js_obj["population"], int)

sglang/utils.py CHANGED
@@ -1,6 +1,7 @@
  """Common utilities."""

  import base64
+ import importlib
  import json
  import logging
  import signal
@@ -261,3 +262,24 @@ def graceful_registry(sub_module_name):
          logger.info(f"{sub_module_name} recive sigterm")

      signal.signal(signal.SIGTERM, graceful_shutdown)
+
+
+ class LazyImport:
+     def __init__(self, module_name, class_name):
+         self.module_name = module_name
+         self.class_name = class_name
+         self._module = None
+
+     def _load(self):
+         if self._module is None:
+             module = importlib.import_module(self.module_name)
+             self._module = getattr(module, self.class_name)
+         return self._module
+
+     def __getattr__(self, name):
+         module = self._load()
+         return getattr(module, name)
+
+     def __call__(self, *args, **kwargs):
+         module = self._load()
+         return module(*args, **kwargs)
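LazyImport records a module path and attribute name but defers the actual import until the object is first touched or called, which keeps importing sglang cheap when optional backends are installed. An illustrative use; the module path and model name below are examples, not the registrations the package itself makes:

from sglang.utils import LazyImport

# Nothing is imported yet; only the names are stored.
OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")

# The first call triggers importlib.import_module and resolves the class.
backend = OpenAI("gpt-4o-mini")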
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.5"
+ __version__ = "0.2.7"