sglang 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. sglang/__init__.py +33 -26
  2. sglang/api.py +9 -1
  3. sglang/bench_latency.py +2 -2
  4. sglang/bench_serving.py +10 -1
  5. sglang/check_env.py +1 -1
  6. sglang/lang/backend/litellm.py +1 -1
  7. sglang/lang/backend/openai.py +1 -1
  8. sglang/lang/interpreter.py +21 -5
  9. sglang/lang/ir.py +1 -2
  10. sglang/srt/constrained/__init__.py +15 -0
  11. sglang/srt/constrained/{base_cache.py → base_tool_cache.py} +17 -2
  12. sglang/srt/constrained/fsm_cache.py +17 -2
  13. sglang/srt/constrained/jump_forward.py +17 -2
  14. sglang/srt/conversation.py +26 -0
  15. sglang/srt/hf_transformers_utils.py +15 -0
  16. sglang/srt/layers/context_flashattention_nopad.py +15 -0
  17. sglang/srt/layers/extend_attention.py +15 -0
  18. sglang/srt/layers/fused_moe.py +15 -0
  19. sglang/srt/layers/linear.py +15 -0
  20. sglang/srt/layers/logits_processor.py +41 -13
  21. sglang/srt/layers/quantization/__init__.py +15 -0
  22. sglang/srt/layers/quantization/fp8.py +15 -0
  23. sglang/srt/layers/radix_attention.py +17 -2
  24. sglang/srt/layers/token_attention.py +16 -1
  25. sglang/srt/managers/{controller/manager_multi.py → controller_multi.py} +17 -2
  26. sglang/srt/managers/{controller/manager_single.py → controller_single.py} +17 -2
  27. sglang/srt/managers/detokenizer_manager.py +16 -1
  28. sglang/srt/managers/io_struct.py +36 -3
  29. sglang/srt/managers/{controller/schedule_heuristic.py → policy_scheduler.py} +37 -22
  30. sglang/srt/managers/{controller/infer_batch.py → schedule_batch.py} +60 -21
  31. sglang/srt/managers/tokenizer_manager.py +39 -16
  32. sglang/srt/managers/{controller/tp_worker.py → tp_worker.py} +159 -46
  33. sglang/srt/mem_cache/base_cache.py +43 -0
  34. sglang/srt/mem_cache/chunk_cache.py +60 -0
  35. sglang/srt/mem_cache/flush_cache.py +33 -0
  36. sglang/srt/{memory_pool.py → mem_cache/memory_pool.py} +16 -1
  37. sglang/srt/{managers/controller → mem_cache}/radix_cache.py +20 -2
  38. sglang/srt/mm_utils.py +15 -0
  39. sglang/srt/model_config.py +15 -0
  40. sglang/srt/{managers/controller → model_executor}/cuda_graph_runner.py +16 -1
  41. sglang/srt/{managers/controller → model_executor}/model_runner.py +49 -14
  42. sglang/srt/model_loader/model_loader.py +15 -0
  43. sglang/srt/model_loader/utils.py +16 -1
  44. sglang/srt/models/chatglm.py +16 -1
  45. sglang/srt/models/commandr.py +16 -1
  46. sglang/srt/models/dbrx.py +16 -1
  47. sglang/srt/models/deepseek.py +16 -1
  48. sglang/srt/models/deepseek_v2.py +16 -1
  49. sglang/srt/models/gemma.py +16 -1
  50. sglang/srt/models/gemma2.py +16 -1
  51. sglang/srt/models/gpt_bigcode.py +16 -1
  52. sglang/srt/models/grok.py +16 -1
  53. sglang/srt/models/internlm2.py +16 -1
  54. sglang/srt/models/llama2.py +21 -22
  55. sglang/srt/models/llama_classification.py +16 -1
  56. sglang/srt/models/llava.py +17 -2
  57. sglang/srt/models/llavavid.py +17 -2
  58. sglang/srt/models/minicpm.py +16 -1
  59. sglang/srt/models/mistral.py +15 -0
  60. sglang/srt/models/mixtral.py +16 -1
  61. sglang/srt/models/mixtral_quant.py +16 -1
  62. sglang/srt/models/qwen.py +16 -1
  63. sglang/srt/models/qwen2.py +16 -1
  64. sglang/srt/models/qwen2_moe.py +16 -1
  65. sglang/srt/models/stablelm.py +16 -1
  66. sglang/srt/models/yivl.py +15 -0
  67. sglang/srt/openai_api/adapter.py +569 -131
  68. sglang/srt/openai_api/protocol.py +84 -2
  69. sglang/srt/sampling_params.py +15 -0
  70. sglang/srt/server.py +92 -23
  71. sglang/srt/server_args.py +52 -11
  72. sglang/srt/utils.py +15 -0
  73. sglang/test/test_programs.py +9 -6
  74. sglang/utils.py +22 -0
  75. sglang/version.py +1 -1
  76. {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/METADATA +33 -7
  77. sglang-0.2.8.dist-info/RECORD +95 -0
  78. {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/WHEEL +1 -1
  79. sglang/srt/flush_cache.py +0 -18
  80. sglang-0.2.6.dist-info/RECORD +0 -93
  81. {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/LICENSE +0 -0
  82. {sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/top_level.txt +0 -0
sglang/srt/openai_api/protocol.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """Pydantic models for OpenAI API protocol"""

  import time
@@ -39,12 +54,79 @@ class LogProbs(BaseModel):
      top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list)


+ class TopLogprob(BaseModel):
+     token: str
+     bytes: List[int]
+     logprob: float
+
+
+ class ChatCompletionTokenLogprob(BaseModel):
+     token: str
+     bytes: List[int]
+     logprob: float
+     top_logprobs: List[TopLogprob]
+
+
+ class ChoiceLogprobs(BaseModel):
+     # build for v1/chat/completions response
+     content: List[ChatCompletionTokenLogprob]
+
+
  class UsageInfo(BaseModel):
      prompt_tokens: int = 0
      total_tokens: int = 0
      completion_tokens: Optional[int] = 0


+ class FileRequest(BaseModel):
+     # https://platform.openai.com/docs/api-reference/files/create
+     file: bytes  # The File object (not file name) to be uploaded
+     purpose: str = (
+         "batch"  # The intended purpose of the uploaded file, default is "batch"
+     )
+
+
+ class FileResponse(BaseModel):
+     id: str
+     object: str = "file"
+     bytes: int
+     created_at: int
+     filename: str
+     purpose: str
+
+
+ class BatchRequest(BaseModel):
+     input_file_id: (
+         str  # The ID of an uploaded file that contains requests for the new batch
+     )
+     endpoint: str  # The endpoint to be used for all requests in the batch
+     completion_window: str  # The time frame within which the batch should be processed
+     metadata: Optional[dict] = None  # Optional custom metadata for the batch
+
+
+ class BatchResponse(BaseModel):
+     id: str
+     object: str = "batch"
+     endpoint: str
+     errors: Optional[dict] = None
+     input_file_id: str
+     completion_window: str
+     status: str = "validating"
+     output_file_id: Optional[str] = None
+     error_file_id: Optional[str] = None
+     created_at: int
+     in_progress_at: Optional[int] = None
+     expires_at: Optional[int] = None
+     finalizing_at: Optional[int] = None
+     completed_at: Optional[int] = None
+     failed_at: Optional[int] = None
+     expired_at: Optional[int] = None
+     cancelling_at: Optional[int] = None
+     cancelled_at: Optional[int] = None
+     request_counts: dict = {"total": 0, "completed": 0, "failed": 0}
+     metadata: Optional[dict] = None
+
+
  class CompletionRequest(BaseModel):
      # Ordered by official OpenAI API documentation
      # https://platform.openai.com/docs/api-reference/completions/create
@@ -175,8 +257,8 @@ class ChatMessage(BaseModel):
  class ChatCompletionResponseChoice(BaseModel):
      index: int
      message: ChatMessage
-     logprobs: Optional[LogProbs] = None
-     finish_reason: Optional[str] = None
+     logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
+     finish_reason: str


  class ChatCompletionResponse(BaseModel):
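Note (not part of the diff): the new file/batch models above are plain Pydantic classes, so they can be constructed and serialized directly. A minimal sketch, assuming sglang 0.2.8 with the definitions shown in the hunk; all IDs and values below are made-up placeholders.

```python
import time

from sglang.srt.openai_api.protocol import BatchResponse, FileResponse

# Hypothetical objects, mirroring the fields added in the hunk above.
uploaded = FileResponse(
    id="backend_input_file-example",  # placeholder id
    bytes=2048,
    created_at=int(time.time()),
    filename="batch_requests.jsonl",
    purpose="batch",
)

batch = BatchResponse(
    id="batch-example",               # placeholder id
    endpoint="/v1/chat/completions",
    input_file_id=uploaded.id,
    completion_window="24h",
    created_at=int(time.time()),
)

print(uploaded.object, batch.object)  # "file", "batch" (defaults)
print(batch.status)                   # "validating" until the batch is processed
```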
sglang/srt/sampling_params.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """Sampling parameters for text generation."""

  from typing import List, Optional, Union
sglang/srt/server.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """
  The entry point of inference server.
  SRT = SGLang Runtime.
@@ -23,17 +38,17 @@ import psutil
  import requests
  import uvicorn
  import uvloop
- from fastapi import FastAPI, Request
+ from fastapi import FastAPI, File, Form, Request, UploadFile
  from fastapi.responses import JSONResponse, Response, StreamingResponse

  from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.srt.constrained import disable_cache
  from sglang.srt.hf_transformers_utils import get_tokenizer
- from sglang.srt.managers.controller.manager_multi import (
+ from sglang.srt.managers.controller_multi import (
      start_controller_process as start_controller_process_multi,
  )
- from sglang.srt.managers.controller.manager_single import launch_tp_servers
- from sglang.srt.managers.controller.manager_single import (
+ from sglang.srt.managers.controller_single import launch_tp_servers
+ from sglang.srt.managers.controller_single import (
      start_controller_process as start_controller_process_single,
  )
  from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
@@ -41,8 +56,13 @@ from sglang.srt.managers.io_struct import GenerateReqInput
  from sglang.srt.managers.tokenizer_manager import TokenizerManager
  from sglang.srt.openai_api.adapter import (
      load_chat_template_for_openai_api,
+     v1_batches,
      v1_chat_completions,
      v1_completions,
+     v1_files_create,
+     v1_retrieve_batch,
+     v1_retrieve_file,
+     v1_retrieve_file_content,
  )
  from sglang.srt.openai_api.protocol import ModelCard, ModelList
  from sglang.srt.server_args import PortArgs, ServerArgs
@@ -137,6 +157,35 @@ async def openai_v1_chat_completions(raw_request: Request):
      return await v1_chat_completions(tokenizer_manager, raw_request)


+ @app.post("/v1/files")
+ async def openai_v1_files(file: UploadFile = File(...), purpose: str = Form("batch")):
+     return await v1_files_create(
+         file, purpose, tokenizer_manager.server_args.file_storage_pth
+     )
+
+
+ @app.post("/v1/batches")
+ async def openai_v1_batches(raw_request: Request):
+     return await v1_batches(tokenizer_manager, raw_request)
+
+
+ @app.get("/v1/batches/{batch_id}")
+ async def retrieve_batch(batch_id: str):
+     return await v1_retrieve_batch(batch_id)
+
+
+ @app.get("/v1/files/{file_id}")
+ async def retrieve_file(file_id: str):
+     # https://platform.openai.com/docs/api-reference/files/retrieve
+     return await v1_retrieve_file(file_id)
+
+
+ @app.get("/v1/files/{file_id}/content")
+ async def retrieve_file_content(file_id: str):
+     # https://platform.openai.com/docs/api-reference/files/retrieve-contents
+     return await v1_retrieve_file_content(file_id)
+
+
  @app.get("/v1/models")
  def available_models():
      """Show available models."""
@@ -160,6 +209,39 @@ def _set_torch_compile_config():
      torch._dynamo.config.accumulated_cache_size_limit = 256


+ def set_envs_and_config(server_args: ServerArgs):
+     # Set global environments
+     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+     os.environ["NCCL_CUMEM_ENABLE"] = "0"
+     os.environ["NCCL_NVLS_ENABLE"] = "0"
+     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+     # Set ulimit
+     set_ulimit()
+
+     # Enable show time cost for debugging
+     if server_args.show_time_cost:
+         enable_show_time_cost()
+
+     # Disable disk cache
+     if server_args.disable_disk_cache:
+         disable_cache()
+
+     # Fix triton bugs
+     if server_args.tp_size * server_args.dp_size > 1:
+         # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
+         maybe_set_triton_cache_manager()
+
+     # Set torch compile config
+     if server_args.enable_torch_compile:
+         _set_torch_compile_config()
+
+     # Set global chat template
+     if server_args.chat_template:
+         # TODO: replace this with huggingface transformers template
+         load_chat_template_for_openai_api(server_args.chat_template)
+
+
  def launch_server(
      server_args: ServerArgs,
      model_overide_args: Optional[dict] = None,
@@ -175,32 +257,16 @@ def launch_server(
          format="%(message)s",
      )

-     # Set global environments
-     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
-     os.environ["NCCL_CUMEM_ENABLE"] = "0"
-     os.environ["NCCL_NVLS_ENABLE"] = "0"
-     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
-     set_ulimit()
-     if server_args.show_time_cost:
-         enable_show_time_cost()
-     if server_args.disable_disk_cache:
-         disable_cache()
      if not server_args.disable_flashinfer:
          assert_pkg_version(
              "flashinfer",
-             "0.1.1",
+             "0.1.3",
              "Please uninstall the old version and "
              "reinstall the latest version by following the instructions "
              "at https://docs.flashinfer.ai/installation.html.",
          )
-     if server_args.tp_size * server_args.dp_size > 1:
-         # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
-         maybe_set_triton_cache_manager()
-     if server_args.chat_template:
-         # TODO: replace this with huggingface transformers template
-         load_chat_template_for_openai_api(server_args.chat_template)
-     if server_args.enable_torch_compile:
-         _set_torch_compile_config()
+
+     set_envs_and_config(server_args)

      # Allocate ports
      server_args.port, server_args.additional_ports = allocate_init_ports(
@@ -413,6 +479,9 @@ class Runtime:
          parent.wait(timeout=5)
          self.pid = None

+     def cache_prefix(self, prefix: str):
+         self.endpoint.cache_prefix(prefix)
+
      def get_tokenizer(self):
          return get_tokenizer(
              self.server_args.tokenizer_path,
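Note (not part of the diff): the new routes registered above follow the OpenAI Files/Batches flow. A rough client-side sketch against a locally running server, assuming the default `http://localhost:30000` address; the file name, batch endpoint, and completion window are illustrative.

```python
import requests

base_url = "http://localhost:30000"  # assumes a local sglang server exposing the routes above

# 1. Upload a JSONL file of requests (multipart form: "file" + "purpose", as in the /v1/files route).
with open("batch_requests.jsonl", "rb") as f:
    uploaded = requests.post(
        f"{base_url}/v1/files", files={"file": f}, data={"purpose": "batch"}
    ).json()

# 2. Create a batch that replays the uploaded requests against the chat completions endpoint.
batch = requests.post(
    f"{base_url}/v1/batches",
    json={
        "input_file_id": uploaded["id"],
        "endpoint": "/v1/chat/completions",
        "completion_window": "24h",
    },
).json()

# 3. Poll the batch and download its output file once one is available.
status = requests.get(f"{base_url}/v1/batches/{batch['id']}").json()
if status.get("output_file_id"):
    results = requests.get(f"{base_url}/v1/files/{status['output_file_id']}/content")
    print(results.text)
```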
sglang/srt/server_args.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """The arguments of the server."""

  import argparse
@@ -29,7 +44,8 @@ class ServerArgs:
      max_prefill_tokens: Optional[int] = None
      max_running_requests: Optional[int] = None
      max_num_reqs: Optional[int] = None
-     schedule_heuristic: str = "lpm"
+     max_total_tokens: Optional[int] = None
+     schedule_policy: str = "lpm"
      schedule_conservativeness: float = 1.0

      # Other runtime options
@@ -45,11 +61,15 @@

      # Other
      api_key: str = ""
+     file_storage_pth: str = "SGlang_storage"

      # Data parallelism
      dp_size: int = 1
      load_balance_method: str = "round_robin"

+     # Chunked Prefill
+     chunked_prefill_size: Optional[int] = None
+
      # Optimization/debug options
      disable_flashinfer: bool = False
      disable_flashinfer_sampling: bool = False
@@ -72,15 +92,15 @@
              self.tokenizer_path = self.model_path
          if self.mem_fraction_static is None:
              if self.tp_size >= 16:
-                 self.mem_fraction_static = 0.80
+                 self.mem_fraction_static = 0.79
              elif self.tp_size >= 8:
-                 self.mem_fraction_static = 0.84
+                 self.mem_fraction_static = 0.83
              elif self.tp_size >= 4:
-                 self.mem_fraction_static = 0.86
+                 self.mem_fraction_static = 0.85
              elif self.tp_size >= 2:
-                 self.mem_fraction_static = 0.88
+                 self.mem_fraction_static = 0.87
              else:
-                 self.mem_fraction_static = 0.89
+                 self.mem_fraction_static = 0.88
          if isinstance(self.additional_ports, int):
              self.additional_ports = [self.additional_ports]
          elif self.additional_ports is None:
@@ -176,6 +196,7 @@
                  "gptq",
                  "marlin",
                  "gptq_marlin",
+                 "awq_marlin",
                  "squeezellm",
                  "bitsandbytes",
              ],
@@ -208,15 +229,21 @@
          parser.add_argument(
              "--max-num-reqs",
              type=int,
-             default=None,
+             default=ServerArgs.max_num_reqs,
              help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.",
          )
          parser.add_argument(
-             "--schedule-heuristic",
+             "--max-total-tokens",
+             type=int,
+             default=ServerArgs.max_total_tokens,
+             help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.",
+         )
+         parser.add_argument(
+             "--schedule-policy",
              type=str,
-             default=ServerArgs.schedule_heuristic,
+             default=ServerArgs.schedule_policy,
              choices=["lpm", "random", "fcfs", "dfs-weight"],
-             help="The scheduling heuristic.",
+             help="The scheduling policy of the requests.",
          )
          parser.add_argument(
              "--schedule-conservativeness",
@@ -270,6 +297,12 @@
              default=ServerArgs.api_key,
              help="Set API key of the server.",
          )
+         parser.add_argument(
+             "--file-storage-pth",
+             type=str,
+             default=ServerArgs.file_storage_pth,
+             help="The path of the file storage in backend.",
+         )

          # Data parallelism
          parser.add_argument(
@@ -296,10 +329,18 @@
              help="The nccl init address of multi-node server.",
          )
          parser.add_argument(
-             "--nnodes", type=int, default=1, help="The number of nodes."
+             "--nnodes", type=int, default=ServerArgs.nnodes, help="The number of nodes."
          )
          parser.add_argument("--node-rank", type=int, help="The node rank.")

+         # Chunked prefill
+         parser.add_argument(
+             "--chunked-prefill-size",
+             type=int,
+             default=ServerArgs.chunked_prefill_size,
+             help="The size of the chunked prefill.",
+         )
+
          # Optimization/debug options
          parser.add_argument(
              "--disable-flashinfer",
sglang/srt/utils.py CHANGED
@@ -1,3 +1,18 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
  """Common utilities."""

  import base64
sglang/test/test_programs.py CHANGED
@@ -113,15 +113,14 @@ def test_decode_json_regex():
          s += ' "population": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
          s += ' "area": ' + sgl.gen(regex=REGEX_INT + ",") + "\n"
          s += ' "latitude": ' + sgl.gen(regex=REGEX_FLOAT + ",") + "\n"
-         s += ' "country": ' + sgl.gen(regex=REGEX_STRING + ",") + "\n"
-         s += ' "timezone": ' + sgl.gen(regex=REGEX_STRING) + "\n"
+         s += ' "country": ' + sgl.gen(regex=REGEX_STRING) + "\n"
          s += "}"

-     ret = decode_json.run()
+     ret = decode_json.run(temperature=0.0)
      try:
          js_obj = json.loads(ret["json_output"])
      except json.decoder.JSONDecodeError:
-         print(ret["json_output"])
+         print("JSONDecodeError", ret["json_output"])
          raise
      assert isinstance(js_obj["name"], str)
      assert isinstance(js_obj["population"], int)
@@ -141,8 +140,12 @@ def test_decode_json():
          s += ' "timezone": ' + sgl.gen(dtype=str) + "\n"
          s += "}"

-     ret = decode_json.run()
-     js_obj = json.loads(ret["json_output"])
+     ret = decode_json.run(max_new_tokens=64)
+     try:
+         js_obj = json.loads(ret["json_output"])
+     except json.decoder.JSONDecodeError:
+         print("JSONDecodeError", ret["json_output"])
+         raise
      assert isinstance(js_obj["name"], str)
      assert isinstance(js_obj["population"], int)

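Note (not part of the diff): the test changes above exercise regex-constrained generation with greedy decoding. A minimal sketch of that pattern, assuming a running local server; the regexes below are illustrative stand-ins for the `REGEX_*` constants defined in `test_programs.py`.

```python
import sglang as sgl

REGEX_INT = r"[0-9]+"            # illustrative; not the test module's constant
REGEX_STRING = r"\"[\w\d\s]*\""  # illustrative; not the test module's constant

@sgl.function
def decode_json(s):
    s += "Generate a JSON object describing the capital of France.\n"
    s += "{\n"
    s += '  "population": ' + sgl.gen("population", regex=REGEX_INT + ",") + "\n"
    s += '  "country": ' + sgl.gen("country", regex=REGEX_STRING) + "\n"
    s += "}"

# Assumes an sglang server is running locally; temperature=0.0 keeps the output
# deterministic, which is what the updated test now requests.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = decode_json.run(temperature=0.0, max_new_tokens=64)
print(state["population"], state["country"])
```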
sglang/utils.py CHANGED
@@ -1,6 +1,7 @@
  """Common utilities."""

  import base64
+ import importlib
  import json
  import logging
  import signal
@@ -261,3 +262,24 @@ def graceful_registry(sub_module_name):
          logger.info(f"{sub_module_name} recive sigterm")

      signal.signal(signal.SIGTERM, graceful_shutdown)
+
+
+ class LazyImport:
+     def __init__(self, module_name, class_name):
+         self.module_name = module_name
+         self.class_name = class_name
+         self._module = None
+
+     def _load(self):
+         if self._module is None:
+             module = importlib.import_module(self.module_name)
+             self._module = getattr(module, self.class_name)
+         return self._module
+
+     def __getattr__(self, name):
+         module = self._load()
+         return getattr(module, name)
+
+     def __call__(self, *args, **kwargs):
+         module = self._load()
+         return module(*args, **kwargs)
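Note (not part of the diff): `LazyImport` defers the actual import until the wrapped class is first called or one of its attributes is accessed. A small usage sketch; the wrapped stdlib class is chosen purely for illustration.

```python
from sglang.utils import LazyImport

# Nothing is imported yet; importlib runs on first use.
OrderedDict = LazyImport("collections", "OrderedDict")

d = OrderedDict([("a", 1), ("b", 2)])    # __call__ triggers the real import, then constructs
print(d)
print(OrderedDict.fromkeys(["x", "y"]))  # __getattr__ also resolves against the loaded class
```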
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.6"
+ __version__ = "0.2.8"
{sglang-0.2.6.dist-info → sglang-0.2.8.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.6
+ Version: 0.2.8
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
           Version 2.0, January 2004
@@ -245,6 +245,13 @@ Requires-Dist: outlines >=0.0.44 ; extra == 'srt'

  <div align="center">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+
+ [![PyPI](https://img.shields.io/pypi/v/sglang)](https://pypi.org/project/sglang)
+ ![PyPI - Downloads](https://img.shields.io/pypi/dm/sglang)
+ [![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
+ [![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+ [![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+
  </div>

  --------------------------------------------------------------------------------
@@ -292,7 +299,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/

  ### Method 2: From source
  ```
- git clone https://github.com/sgl-project/sglang.git
+ # Use the stable v0.2.8 branch
+ git clone -b v0.2.8 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -304,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/

  ### Method 3: Using docker
  The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](docker).
- Repalce `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
+ Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).

  ```bash
  docker run --gpus all \
@@ -341,7 +349,7 @@ curl http://localhost:30000/generate \
    }
  }'
  ```
- Learn more about the argument format [here](docs/sampling_params.md).
+ Learn more about the argument format [here](docs/en/sampling_params.md).

  ### OpenAI Compatible API
  In addition, the server supports OpenAI-compatible APIs.
@@ -388,7 +396,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
- - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
  # Node 0
@@ -397,7 +405,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  # Node 1
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
  ```
- - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+ - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
  - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.

@@ -440,7 +448,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  - InternLM 2
  - Mistral NeMo

- Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
+ Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

  ### Benchmark Performance

@@ -671,6 +679,24 @@ for out in state.text_iter():
      print(out, end="", flush=True)
  ```

+ #### Roles
+
+ Use `sgl.system`, `sgl.user` and `sgl.assistant` to set roles when using Chat models. You can also define more complex role prompts using begin and end tokens.
+
+ ```python
+ @sgl.function
+ def chat_example(s):
+     s += sgl.system("You are a helpful assistant.")
+     # Same as: s += s.system("You are a helpful assistant.")
+
+     with s.user():
+         s += "Question: What is the capital of France?"
+
+     s += sgl.assistant_begin()
+     s += "Answer: " + sgl.gen(max_tokens=100, stop="\n")
+     s += sgl.assistant_end()
+ ```
+
  #### Tips and Implementation Details
  - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
  - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
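Note (not part of the diff): the token-length normalization mentioned in the first bullet can be illustrated with a toy calculation; the per-token log probabilities below are made up, and the real implementation obtains them from the model for each candidate choice.

```python
# Hypothetical per-token logprobs for two candidate choices of different lengths.
choice_token_logprobs = {
    "Paris": [-0.2, -0.1],                      # 2 tokens
    "Paris, France": [-0.2, -0.1, -0.3, -0.1],  # 4 tokens
}

def normalized_logprob(logprobs):
    # Average logprob per token, so longer choices are not penalized just for length.
    return sum(logprobs) / len(logprobs)

scores = {choice: normalized_logprob(lp) for choice, lp in choice_token_logprobs.items()}
print(max(scores, key=scores.get), scores)
```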